A rather obscure module I wrote some time ago experiments with using complex data structures to define how a string is to be parsed in a balanced way.
A list of tokens describes the parsing behavior. Each token could either be a string, a regexp, or an arrayref of tokens. The arrayref is used to define a "balance branch" where its first token is searched for in order to enter the branch, and its last token is searched for to leave the branch. Any string/regexp token in between is skipped over as an "escaped" literal.
use Data::Dumper;
use Parse::Balanced qw(parse_balanced is_balanced);
my $str = "05/04/2010 13:09:45 - A - somebody - ( ( my.my id >= 1 ) )
+and ( ( is-relative.to code = 'sister' ) or ( is-relative.to code = '
+brother' ) or ( is-mother.to code = 'dog' ) )";
# recursively parse opening and closing parentheses
my @tokens;
@tokens = ( "(", \@tokens, ")" );
# parse with a single branch
my @p = parse_balanced($str, \@tokens);
print Dumper \@p;
print "String is ", (is_balanced(@p) ? "" : "not "), "balanced.\n";
# break circular reference
@tokens = ();
__END__
$VAR1 = [
'05/04/2010 13:09:45 - A - somebody - ',
[
'(',
' ',
[
'(',
' my.my id >= 1 ',
')'
],
' ',
')'
],
' and ',
[
'(',
' ',
[
'(',
' is-relative.to code = \'sister\' ',
')'
],
' or ',
[
'(',
' is-relative.to code = \'brother\' ',
')'
],
' or ',
[
'(',
' is-mother.to code = \'dog\' ',
')'
],
' ',
')'
]
];
String is balanced.
And here's the module:
package Parse::Balanced;
use warnings;
use strict;
BEGIN {
require Exporter;
*import = \&Exporter::import; # just inherit import() only
my @ALL = qw(parse_balanced is_balanced);
our $VERSION = 1.001;
our @EXPORT_OK = ("ALL", @ALL);
our %EXPORT_TAGS = (ALL => [ @ALL ]);
}
sub _is_array { ref($_[0]) && eval { @{ $_[0] } or 1 } }
sub _is_ref { UNIVERSAL::isa($_[0], "REF") }
sub _is_regex { UNIVERSAL::isa($_[0], "Regexp") }
sub _parse_balanced {
# setup current tokenizing position
my $text = shift;
pos($text) = shift;
# begin with literal opening token
my @parsed = shift;
# build token spec tree
my @tok_tree = map {
my $tok_spec = _is_ref($_) ? ${ $_ } : $_;
my $branch_ref = $tok_spec if _is_array($tok_spec);
my $tok = $branch_ref ? $tok_spec->[0] : $tok_spec;
(defined($tok) && $tok ne "")
? [
_is_regex($tok) ? $tok : quotemeta($tok),
$branch_ref,
]
: ();
} @_;
# find end-of-text when ending token is not found
push @tok_tree, [ qr/\z/ ];
# build and compile token regex
my $tok_regex = join(")|(" => map { $_->[0] } @tok_tree);
$tok_regex = qr/($tok_regex)/;
# accumulate strings up to tokens
my $str = "";
TOKENIZE:
while ($text =~ /(.*?)(?:$tok_regex)/sgc)
{
$str .= $1;
# check captured token against token tree
for my $i (0 .. $#tok_tree)
{
# look through $2, $3, $4, ...
my $token;
{
no strict qw(refs);
$token = ${ $i + 2 };
}
if (defined($token))
{
# found ending token or end-of-text
if ($i >= $#tok_tree - 1)
{
push(@parsed, $str) if $str ne "";
push @parsed, $token;
last TOKENIZE;
}
# check if we branched
my $branch = $tok_tree[$i]->[1];
if ($branch)
{
push(@parsed, $str) if $str ne "";
# recursively tokenize new branch
my ($pos, $toks) = _parse_balanced(
$text,
pos($text),
$token,
@{ $branch }[1 .. $#{ $branch }],
);
push @parsed, $toks;
pos($text) = $pos;
$str = "";
}
else
{
# skip over embedded literal
$str .= $token;
}
next TOKENIZE;
}
}
}
return pos($text), \@parsed;
}
# entry to recursive parsing subroutine _parse_balanced()
sub parse_balanced {
my $text = shift;
my $parsed = _parse_balanced($text, 0, "", @_, qr/\z/);
return @{ $parsed }[1 .. $#{ $parsed } - 1];
}
# determine if parse was balanced
sub is_balanced (\@) {
my $is_balanced = 1;
my $array_ref = shift;
FIND_EMPTY_TOKEN:
{
my $token = $array_ref->[-1];
if (defined($token) && $token eq "")
{
# found empty token
$is_balanced = 0;
last FIND_EMPTY_TOKEN;
}
# look for next array ref backwards
my $j = $#{ $array_ref };
for my $i (0 .. $j)
{
if (_is_array($array_ref->[$j - $i]))
{
$array_ref = $array_ref->[$j - $i];
redo FIND_EMPTY_TOKEN;
}
}
}
return $is_balanced;
}
'Parse::Balanced';
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.