use Data::Dumper; use Parse::Balanced qw(parse_balanced is_balanced); my $str = "05/04/2010 13:09:45 - A - somebody - ( ( my.my id >= 1 ) ) and ( ( is-relative.to code = 'sister' ) or ( is-relative.to code = 'brother' ) or ( is-mother.to code = 'dog' ) )"; # recursively parse opening and closing parentheses my @tokens; @tokens = ( "(", \@tokens, ")" ); # parse with a single branch my @p = parse_balanced($str, \@tokens); print Dumper \@p; print "String is ", (is_balanced(@p) ? "" : "not "), "balanced.\n"; # break circular reference @tokens = (); __END__ $VAR1 = [ '05/04/2010 13:09:45 - A - somebody - ', [ '(', ' ', [ '(', ' my.my id >= 1 ', ')' ], ' ', ')' ], ' and ', [ '(', ' ', [ '(', ' is-relative.to code = \'sister\' ', ')' ], ' or ', [ '(', ' is-relative.to code = \'brother\' ', ')' ], ' or ', [ '(', ' is-mother.to code = \'dog\' ', ')' ], ' ', ')' ] ]; String is balanced. #### package Parse::Balanced; use warnings; use strict; BEGIN { require Exporter; *import = \&Exporter::import; # just inherit import() only my @ALL = qw(parse_balanced is_balanced); our $VERSION = 1.001; our @EXPORT_OK = ("ALL", @ALL); our %EXPORT_TAGS = (ALL => [ @ALL ]); } sub _is_array { ref($_[0]) && eval { @{ $_[0] } or 1 } } sub _is_ref { UNIVERSAL::isa($_[0], "REF") } sub _is_regex { UNIVERSAL::isa($_[0], "Regexp") } sub _parse_balanced { # setup current tokenizing position my $text = shift; pos($text) = shift; # begin with literal opening token my @parsed = shift; # build token spec tree my @tok_tree = map { my $tok_spec = _is_ref($_) ? ${ $_ } : $_; my $branch_ref = $tok_spec if _is_array($tok_spec); my $tok = $branch_ref ? $tok_spec->[0] : $tok_spec; (defined($tok) && $tok ne "") ? [ _is_regex($tok) ? $tok : quotemeta($tok), $branch_ref, ] : (); } @_; # find end-of-text when ending token is not found push @tok_tree, [ qr/\z/ ]; # build and compile token regex my $tok_regex = join(")|(" => map { $_->[0] } @tok_tree); $tok_regex = qr/($tok_regex)/; # accumulate strings up to tokens my $str = ""; TOKENIZE: while ($text =~ /(.*?)(?:$tok_regex)/sgc) { $str .= $1; # check captured token against token tree for my $i (0 .. $#tok_tree) { # look through $2, $3, $4, ... my $token; { no strict qw(refs); $token = ${ $i + 2 }; } if (defined($token)) { # found ending token or end-of-text if ($i >= $#tok_tree - 1) { push(@parsed, $str) if $str ne ""; push @parsed, $token; last TOKENIZE; } # check if we branched my $branch = $tok_tree[$i]->[1]; if ($branch) { push(@parsed, $str) if $str ne ""; # recursively tokenize new branch my ($pos, $toks) = _parse_balanced( $text, pos($text), $token, @{ $branch }[1 .. $#{ $branch }], ); push @parsed, $toks; pos($text) = $pos; $str = ""; } else { # skip over embedded literal $str .= $token; } next TOKENIZE; } } } return pos($text), \@parsed; } # entry to recursive parsing subroutine _parse_balanced() sub parse_balanced { my $text = shift; my $parsed = _parse_balanced($text, 0, "", @_, qr/\z/); return @{ $parsed }[1 .. $#{ $parsed } - 1]; } # determine if parse was balanced sub is_balanced (\@) { my $is_balanced = 1; my $array_ref = shift; FIND_EMPTY_TOKEN: { my $token = $array_ref->[-1]; if (defined($token) && $token eq "") { # found empty token $is_balanced = 0; last FIND_EMPTY_TOKEN; } # look for next array ref backwards my $j = $#{ $array_ref }; for my $i (0 .. $j) { if (_is_array($array_ref->[$j - $i])) { $array_ref = $array_ref->[$j - $i]; redo FIND_EMPTY_TOKEN; } } } return $is_balanced; } 'Parse::Balanced';