Your code and the solutions already presented won't handle nested parens correctly. For example,
NOT ("test(s)"[MESH] AND ("A"[MESH] OR "B"[MESH]))
will fail. The solution below works better (although it uses an "experimental" regexp feature.
#/usr/bin/perl
use strict;
use warnings;
my $string = 'NOT ("test(s)"[MESH] AND ("A"[MESH] OR "B"[MESH]))';
my $parens_guts; # Can't combine this line with the next one.
$parens_guts = qr/
(?:
"[^"]*"
|
\( (??{ $parens_guts }) \)
|
[^"()]
)*
/sx;
$string =~ s/
\s* NOT \s*
(?:
"[^"]*"\[MESH\]
|
\( (??{ $parens_guts }) \)
)
//gsx;
print("[$string]$/");
I could write a Parse::RecDescent solution if you don't want to use the "experimental" (??{ ... }).
Update: I went and did the Parse::RecDescent version for fun at lunch.
make_parser.pl -- Run this once to create MeshGrammar.pm
#!/usr/bin/perl
# make_parser.pl
use strict;
use warnings;
use Parse::RecDescent ();
my $grammar = <<'__EOI__';
parse : expr eof { $item[1] }
eof : /^\Z/
expr : expr_(s?) term { [ (map{@$_}@{$item[1]}), $item[2] ] }
expr_ : term binop { [ $item[1], $item[2] ] }
term : unary term { [ $item[1], $item[2] ] }
| /"[^"]*"\[MESH\]/ { [ 'MESH', $item[1] ] }
| '(' expr ')' { [ 'PAREN', $item[2] ] }
binop : 'OR' { $item[1] }
| 'AND' { $item[1] }
| 'NOT' { $item[1] }
# unary : 'NOT' { $item[1] }
# I don't know what to do with it,
# so I'm not supporting it.
unary : { undef }
__EOI__
$::RD_HINT = 1;
# $::RD_TRACE = 1;
rename('MeshGrammar.pm', 'MeshGrammar.pm.bak');
Parse::RecDescent->Precompile($grammar, 'MeshGrammar')
or die("Bad grammar.\n");
remove_not.pl
#!/usr/bin/perl
# remove_not.pl
use strict;
use warnings;
use MeshGrammar ();
sub process_term {
our $output; local *output = \$_[0];
my $term = $_[1];
if ($term->[0] eq 'MESH') {
$output .= $term->[1];
return;
}
if ($term->[0] eq 'PAREN') {
$output .= '(';
process_expr($output, $term->[1]);
$output .= ')';
return;
}
if ($term->[0] eq 'NOT') {
warn("WARNING: Behaviour for unary NOT not defined. Unary NOT no
+t removed.\n");
$output .= $term->[0];
$output .= ' ';
process_term($output, $term->[1]);
return;
}
die(); # Should never reach here.
}
sub process_expr {
our $output; local *output = \$_[0];
my $expr = $_[1];
my $i = 0;
my $n = @$expr;
my $term;
my $op;
$term = $expr->[$i++];
process_term($output, $term);
while ($i != $n) {
$op = $expr->[$i++];
$term = $expr->[$i++];
next if ($op eq 'NOT');
$output .= ' ';
$output .= $op;
$output .= ' ';
process_term($output, $term);
}
}
my $parser = MeshGrammar->new();
# The following can be put in a loop.
my $string = <<'__EOI__';
("Immunologic and Biological Factors"[MESH] OR "Immunosuppressive Agen
+ts"[MESH] OR "Transplantation Immunology"[MESH] OR "Allergy and Immun
+ology"[MESH] OR "Graft vs Host Disease"[MESH]) NOT ("Foo"[MESH] OR "B
+ar"[MESH]) AND ("Kidney Transplantation"[MESH] OR "Liver Transplantat
+ion"[MESH] OR "Heart Transplantation"[MESH]) NOT ("My Term"[MESH] OR
+"Blah"[MESH]) NOT "foobar"[MESH]
__EOI__
my $tree = $parser->parse($string)
or die("Bad text.\n");
# require Data::Dumper;
# print Data::Dumper::Dumper($tree);
my $output = '';
process_expr($output, $tree);
print($output, $/);