note
ELISHEVA
<p>Simple nested language structures can be easily parsed with a loop and a stack. Whenever you encounter the character sequence that marks the start of the nested language/data, you add to the stack. Whenever you encounter the character sequence that marks the end of the nested language/data you pop the stack. It looks something like this:</p>
<p>Note: the <c>/\G..../gc</c> idiom means "start matching where we left off, and move \G to the character just after the match". <c>\G</c> means "where we left off". For example, <c>qr(\Ga)</c> requires there to be an "a" right where we left off, whereas <c>qr(a)</c> looks for the first "a" anywhere after where we left off, even 1000 characters later.</p>
<code>
use strict;
use warnings;
use Data::Dumper;
# Store $v under key $k in hash $h.
# The first value for a key is stored as-is; a second value turns the slot
# into an array ref holding both, and further values are pushed onto it.
# The same rule is used for plain values and for nested hashes, so a
# repeated nested key no longer overwrites the earlier one.
sub _store_value {
    my ($h, $k, $v) = @_;

    if (!exists $h->{$k}) {
        $h->{$k} = $v;
    }
    elsif (ref($h->{$k}) eq 'ARRAY') {
        push @{ $h->{$k} }, $v;
    }
    else {
        $h->{$k} = [ $h->{$k}, $v ];
    }
    return;
}

# Parse nested "(KEY=VALUE)" / "(KEY=(K1=V1)(K2=V2))" records from a
# filehandle into a tree of hashes, using a loop and a stack.
#
# Parameters:
#   $fh - filehandle to read lines from. Lines are chomped and joined
#         without a separator, so an item may span several lines.
#
# Returns a two element list:
#   - reference to the top level hash
#   - number of nesting levels still open at end of input (0 when balanced)
#
# Warnings (via warn): a ")" with no open level to close, and text left
# unparsed at end of input.
sub parse_nested {
    my ($fh) = @_;

    my %root;
    my @stack;            # enclosing hashes of the hash being filled
    my $h   = \%root;     # hash currently being filled
    my $buf = '';         # text not yet consumed by the parser

    # Close the current nesting level. With nothing left to close, keep
    # filling the top level hash rather than letting $h become undef,
    # which would detach everything stored afterwards from the result.
    my $close_level = sub {
        if (@stack) {
            $h = pop @stack;
        }
        else {
            warn "unbalanced ')' ignored\n";
        }
        return;
    };

    while (my $line = <$fh>) {
        chomp $line;
        $buf .= $line;

        # Offset of the first character of $buf not consumed yet. It has
        # to start at 0 for every buffer: an offset left over from the
        # previous buffer says nothing about this one.
        my $iPos = 0;

        # Closing parens may sit at the very start of the buffer (e.g. a
        # line holding only ")"), where no "(KEY=" precedes them.
        pos($buf) = 0;
        while ($buf =~ /\G\s*\)/gc) {
            $close_level->();
            $iPos = pos($buf);
        }

        while ($buf =~ /\s*\(\s*(\w+)\s*=/g) {    # get start, e.g. (S=
            my $k     = $1;
            my $start = $-[0];    # where this item begins in $buf

            # Decide whether what follows the start is nested data,
            # (S=(... , or a key value pair, (S=V).
            if ($buf =~ /\G\s*\(/gc) {
                my $child = {};
                _store_value($h, $k, $child);
                push @stack, $h;
                $h = $child;

                # Step back to just before the "(" so the next item can
                # be read by the start pattern above.
                pos($buf) = pos($buf) - 1;
            }
            elsif ($buf =~ /\G\s*([^)]*)\s*(\))/gc) {
                _store_value($h, $k, $1);
            }
            else {
                # The value is not terminated in the text read so far.
                # Keep the whole item, key included, in the buffer so
                # the next line can complete it.
                $iPos = $start;
                last;
            }

            # Extra closing parens signal the end of nested data.
            while ($buf =~ /\G\s*\)/gc) {
                $close_level->();
            }

            # Remember how far we got: when the start pattern fails, pos
            # is reset, and whatever follows this offset must be carried
            # over into the next buffer.
            $iPos = pos($buf);
        }

        # Keep only the unparsed tail.
        $buf = $iPos < length($buf) ? substr($buf, $iPos) : '';
    }

    warn "unparsed trailing input: '$buf'\n" if $buf =~ /\S/;

    return (\%root, scalar @stack);
}

my ($hData, $open_levels) = parse_nested(\*DATA);
print Data::Dumper->Dump([$hData]);
print "stack = " . $open_levels . "\n";
__DATA__
(S=(SN=ac2.bd)
(I1=(IN=s%1)(NM=1)
(HL=(HLD=kkk kjkjk)(ST=abdc)(HI=REM SSS)(H_M=9)(HL=72)(EB=0)(ER=0)(HI=E043-93A-DF0-0AB63E)(PE=aaa)(HN=DEE)(SS=NS)(SED=(APR=(PAD=kkk)(PN=9905)(HH=llkjk))(DD=(LLL=kkk))))
(ppp=1)(RAW=kkk)(DN=kkk)(RIN=ppp))
(PPP=1)
(AA=LLI))
</code>
998258
998258