Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine
 
PerlMonks  

Re: Growing strings in search

by roboticus (Chancellor)
on Apr 16, 2020 at 19:33 UTC ( #11115646=note: print w/replies, xml ) Need Help??


in reply to Growing strings in search

b4swine:

Here's the demo I put together. It still has shortcomings, but I got tired of playing with it, so I thought I'd post it here in case it would be interesting to you or another perlmonk at some time. It has a few shortcomings due to me wanting to keep it a short demo, and I'm not happy about how the abstractions are factored, but I've got other things I want to do, so I'm leaving it as-is for you or anyone to play with. It does, however, demonstrate what I described in my earlier note.

Code:

#!env perl # # ex_regex_by_char_via_NFA.pl # # Quickie hand-rolled state machine for simple regexes # # 20200415 #---------- STATE MACHINE CLASSES ------------------------------------ +-------------- # A state is just a node ID, a bag of edges, and a flag marking it as +a final node. # Each edge is a list holding a function ref to recognize a character +and a ref # to the destination node if the function returns TRUE. package NFA::State { use strict; use warnings; sub _new { my ($class, $ID) = @_; my $self = { ID=>$ID, final=>0, edges=>[], }; return bless $self, $class; } # Set a node as final (true arg) or fetch final flag (no args) sub is_final { my ($self, $new_state) = @_; $self->{final}=$new_state if defined $new_state; return $self->{final}//0; } }; # Toy NFA parser/executor: Hold current state and ref to the NFA desc +ription. # You advance the machine by calling accept() with the next character +to check. # # NOTE: No backtracking, no bookkeeping, so it's easy to make machines + with # corresponding strings that should but don't match. It's easy enough + to fix, # but not worthwhile for a trivial demo. # Example: "(ab*c|c+d?b)" won't match "cabcd". # package NFA::Parser { use strict; use warnings; sub _new { my ($class, $nfa) = @_; return bless { NFA=>$nfa, # NFA description cur=>$nfa->start(), # Current state cnt=>0, # chars read since last in start }, $class; } sub accept { my ($self, $char) = @_; my $curN = $self->{cur}; # No reason to continue if we've got a match return if $curN->is_final(); for my $rE (@{$curN->{edges}}) { if ($rE->[0]($char)) { # character matched, advance to new node $self->{cur} = $rE->[1]; ++$self->{cnt}; return; } } # No match found, go back to start if ($self->{cur} != $self->{NFA}->start()) { $self->{cur} = $self->{NFA}->start(); $self->{cnt} = 0; } } sub is_final { my $self = shift; return $self->{cur}->is_final(); } sub curID { my $self = shift; return $self->{cur}{ID}; } }; # The machine description that holds the nodes and edges. The parser +is a # light wrapper that refers to the NFA so we can have lots of cheap pa +rsers. package NFA { use strict; use warnings; # Create a new NFA with a default START state sub new { my $class = shift; my $self = bless { states=>{ } }, $class; $self->new_node('*'); return $self; }; # Fetch START node sub start { my $self = shift; return $self->{states}{'*'} } # create a new node with specified ID sub new_node { my ($self, $ID) = @_; my $n = NFA::State->_new($ID); $self->{states}{$ID} = $n; return $n; } # create a new edge between nodes for the specified function sub new_edge { my ($self, $srcN, $edgeFn, $dstN) = @_; push @{$srcN->{edges}}, [ $edgeFn, $dstN ]; } # return a parser for the NFA sub parser { my $nfa = shift; return NFA::Parser->_new($nfa); } }; use strict; use warnings; use v5.26; use Data::Dump 'pp'; use constant BUFFER_FULL=>70; # seed random number generator so we can always have same sequence for + DBG srand(5); # NFA to accept (ab*c|c+d?b): # # /'b'\ # v / # 'a'-->(1)--'b'-->(2)--'c'-->((3)) # / \---------'c'--------^ # / # (*0*) /'c' # \ v / # \'c'--->(4)--'d'-->(5)--'b'-->((6)) # \-------'b'----------^ # my $NFA = NFA->new(); my %edgeFuncs; # Edge recognizer functions $edgeFuncs{$_} = gen_edgeFn_char($_) for qw(a b c d); my @nodes = ($NFA->start()); # Nodes as labeled in above diagram $nodes[$_] = $NFA->new_node($_) for 1 .. 6; $nodes[3]->is_final(1); $nodes[6]->is_final(1); # Connect it all up for my $r ( # ( [ sourceNode, character, destNode ]... ) [ 0, 'a', 1 ], [ 0, 'c', 4 ], [ 1, 'b', 2 ], [ 1, 'c', 3 ], [ 2, 'b', 2 ], [ 2, 'c', 3 ], [ 4, 'c', 4 ], [ 4, 'd', 5 ], [ 4, 'b', 6 ], [ 5, 'b', 6 ], ) { $NFA->new_edge( $nodes[$r->[0]], $edgeFuncs{$r->[1]}, $nodes[$r->[ +2]] ); } # Build 10 string streams my @strs = map { { ID=>$_, pos=>-1, string=>init_string(), parser=>$NFA->parser(), match=>0 } } 1 .. 10; # Some known match strings for the demo $strs[2]{string} = "XYZabbbbbcdEF"; $strs[3]{string} = "XYZacDEF"; $strs[4]{string} = "XYZcccccdbBBBBCDEF"; $strs[5]{string} = "XYZccccbDEF"; # OK, everything is set up! ##### # Now process the string streams until all are complete ##### my $passes = 0; my $active_streams = @strs; while ($active_streams) { print "\n\n===== PASS: ", ++$passes, " (active=$active_streams)\n\ +n"; $active_streams = 0; #++$dbg = 0; last if $passes > 1000; #print "\n\n===== PASS: $dbg\n\n"; # Give each string a little processing for my $i (0 .. $#strs) { next if exists $strs[$i]{KILL}; # Ignore completed streams my $rS = $strs[$i]; my $l = length($rS->{string}); ++$active_streams; printf "S[%2u]: ID:$rS->{ID}, p:$rS->{pos}, l:$l, st:%s)\n", $i, $rS->{parser}->curID(); # If we have data waiting, advance the state machine if ($rS->{pos} < $l) { ++$rS->{pos}; my $char = substr($rS->{string}, $rS->{pos}, 1); $rS->{parser}->accept($char); # Display the string, current position and possible match +chars print " '$rS->{string}' ch='$char'->ST=", $rS->{parser}->curID(), "\n"; if ($rS->{parser}->is_final()) { # Match: arrows indicate match, last arrow is current +position print " " x (5 + ($rS->{pos} - $rS->{parser}{cnt} + 1) +); print "^" x $rS->{parser}{cnt}, "=match\n"; $rS->{parser}{cur} = $rS->{parser}{NFA}->start(); # pr +ep for next } else { # No match: just show current position print " " x (5+$rS->{pos}), "^=pos\n"; } } elsif ($l >= BUFFER_FULL) { # No data waiting *and* buffer is full, so we're done print " '$rS->{string}' ** COMPLETED **\n"; $rS->{KILL}=1; } else { print " '$rS->{string}' --waiting for data--\n"; } # Do we have room in the buffer? if ($l < BUFFER_FULL) { # Yes, add data to the buffer (sometimes) if (0.1 > rand) { my $extend = init_string(); $rS->{string} .= $extend; print " '$rS->{string}'\n"; print " " x (5 + $l), "^" x (length $extend), "=new da +ta\n"; } } } $active_streams = scalar grep { ! exists $_->{KILL} } @strs; } # Return a random character from 'a' to 'f' sub a_char { state @alphabet = ('a', 'b', 'c', 'd', 'e', 'f'); return $alphabet[@alphabet*rand]; } # Return a string of 1..9 characters sub init_string { return join("", map { a_char() } 1 .. 3+6*rand); } # Return a function that recognizes the specified character sub gen_edgeFn_char { my $char = shift; return sub { my $nextChar = shift; return $char eq $nextChar; } }

Sample output:

When it runs, it generates over 4K lines of output with a bunch of detail about what it's doing. I've selected pass 75 to include here because it shows all possible actions a stream may perform in the demo:

  • Stream 0 both advances a character and receives new data.
  • Stream 1 is stalled, waiting for new data to arrive.
  • Stream 2 just advances a character.
  • Stream 4 has finished processing and it's "full", so it terminates.
===== PASS: 75 (active=10) S[ 0]: ID:1, p:50, l:63, st:*) 'bebdfbbcbcbbccdecbbcbcfcdabccecadadcaaeadcafefccbceceadefaebabf' +ch='c'->ST=4 ^=pos 'bebdfbbcbcbbccdecbbcbcfcdabccecadadcaaeadcafefccbceceadefaebabfbd +ecfec' ^^ +^^^^^=new data S[ 1]: ID:2, p:42, l:42, st:*) 'bffceddcaeecccfbabbeccbaeaddaeaffabeafadfd' --waiting for data-- S[ 2]: ID:3, p:53, l:60, st:*) 'XYZabbbbbcdEFaceaedebcbbdaeaeedebccdfefaeafadcbfffdbaeecdeca' ch= +'e'->ST=* ^=pos S[ 3]: ID:4, p:40, l:43, st:4) 'XYZacDEFebceeabbccbaabaadaedfcddfabdcdebcbd' ch='b'->ST=6 ^^=match S[ 4]: ID:5, p:73, l:73, st:*) 'XYZcccccdbBBBBCDEFfbdcfdafaceeafdccaefffcdccbebfebeecfeebdadbecdb +aadcaafa' ** COMPLETED ** S[ 5]: ID:6, p:34, l:35, st:*) 'XYZccccbDEFaceafabddbdfebededbafadb' ch=''->ST=* ^=pos S[ 6]: ID:7, p:40, l:40, st:*) 'fdafbfedcaafecaeabfecffbbdbbadccefcddacb' --waiting for data-- S[ 7]: ID:8, p:62, l:67, st:*) 'faaccfeabcabeddecaccdffebeaefffaacacafcddfbdadbdfdeecbdbeebdffbbb +be' ch='b'->ST=* ^= +pos S[ 8]: ID:9, p:44, l:47, st:*) 'aeefaceaccafeffecedaafebbeeedadcfecbccaacdfcdba' ch='b'->ST=* ^=pos S[ 9]: ID:10, p:39, l:39, st:*) 'fcefebdfedadcffcfccfdebbbffcaafcbdfaffc' --waiting for data--

...roboticus

When your only tool is a hammer, all problems look like your thumb.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://11115646]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others about the Monastery: (7)
As of 2020-09-29 11:33 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    If at first I donít succeed, I Ö










    Results (146 votes). Check out past polls.

    Notices?