Here's the demo I put together. It still has shortcomings, but I got tired of playing with it, so I thought I'd post it here in case it would be interesting to you or another perlmonk at some time. It has a few shortcomings due to me wanting to keep it a short demo, and I'm not happy about how the abstractions are factored, but I've got other things I want to do, so I'm leaving it as-is for you or anyone to play with. It does, however, demonstrate what I described in my earlier note.
#!env perl
#
# ex_regex_by_char_via_NFA.pl
#
# Quickie hand-rolled state machine for simple regexes
#
# 20200415
#---------- STATE MACHINE CLASSES ------------------------------------
+--------------
# A state is just a node ID, a bag of edges, and a flag marking it as
+a final node.
# Each edge is a list holding a function ref to recognize a character
+and a ref
# to the destination node if the function returns TRUE.
package NFA::State {
use strict; use warnings;
sub _new {
my ($class, $ID) = @_;
my $self = {
ID=>$ID, final=>0, edges=>[],
};
return bless $self, $class;
}
# Set a node as final (true arg) or fetch final flag (no args)
sub is_final {
my ($self, $new_state) = @_;
$self->{final}=$new_state if defined $new_state;
return $self->{final}//0;
}
};
# Toy NFA parser/executor: Hold current state and ref to the NFA desc
+ription.
# You advance the machine by calling accept() with the next character
+to check.
#
# NOTE: No backtracking, no bookkeeping, so it's easy to make machines
+ with
# corresponding strings that should but don't match. It's easy enough
+ to fix,
# but not worthwhile for a trivial demo.
# Example: "(ab*c|c+d?b)" won't match "cabcd".
#
package NFA::Parser {
use strict; use warnings;
sub _new {
my ($class, $nfa) = @_;
return bless {
NFA=>$nfa, # NFA description
cur=>$nfa->start(), # Current state
cnt=>0, # chars read since last in start
}, $class;
}
sub accept {
my ($self, $char) = @_;
my $curN = $self->{cur};
# No reason to continue if we've got a match
return if $curN->is_final();
for my $rE (@{$curN->{edges}}) {
if ($rE->[0]($char)) {
# character matched, advance to new node
$self->{cur} = $rE->[1];
++$self->{cnt};
return;
}
}
# No match found, go back to start
if ($self->{cur} != $self->{NFA}->start()) {
$self->{cur} = $self->{NFA}->start();
$self->{cnt} = 0;
}
}
sub is_final {
my $self = shift;
return $self->{cur}->is_final();
}
sub curID {
my $self = shift;
return $self->{cur}{ID};
}
};
# The machine description that holds the nodes and edges. The parser
+is a
# light wrapper that refers to the NFA so we can have lots of cheap pa
+rsers.
package NFA {
use strict; use warnings;
# Create a new NFA with a default START state
sub new {
my $class = shift;
my $self = bless { states=>{ } }, $class;
$self->new_node('*');
return $self;
};
# Fetch START node
sub start {
my $self = shift;
return $self->{states}{'*'}
}
# create a new node with specified ID
sub new_node {
my ($self, $ID) = @_;
my $n = NFA::State->_new($ID);
$self->{states}{$ID} = $n;
return $n;
}
# create a new edge between nodes for the specified function
sub new_edge {
my ($self, $srcN, $edgeFn, $dstN) = @_;
push @{$srcN->{edges}}, [ $edgeFn, $dstN ];
}
# return a parser for the NFA
sub parser {
my $nfa = shift;
return NFA::Parser->_new($nfa);
}
};
use strict;
use warnings;
use v5.26;
use Data::Dump 'pp';
use constant BUFFER_FULL=>70;
# seed random number generator so we can always have same sequence for
+ DBG
srand(5);
# NFA to accept (ab*c|c+d?b):
#
# /'b'\
# v /
# 'a'-->(1)--'b'-->(2)--'c'-->((3))
# / \---------'c'--------^
# /
# (*0*) /'c'
# \ v /
# \'c'--->(4)--'d'-->(5)--'b'-->((6))
# \-------'b'----------^
#
my $NFA = NFA->new();
my %edgeFuncs; # Edge recognizer functions
$edgeFuncs{$_} = gen_edgeFn_char($_) for qw(a b c d);
my @nodes = ($NFA->start()); # Nodes as labeled in above diagram
$nodes[$_] = $NFA->new_node($_) for 1 .. 6;
$nodes[3]->is_final(1);
$nodes[6]->is_final(1);
# Connect it all up
for my $r ( # ( [ sourceNode, character, destNode ]... )
[ 0, 'a', 1 ], [ 0, 'c', 4 ],
[ 1, 'b', 2 ], [ 1, 'c', 3 ],
[ 2, 'b', 2 ], [ 2, 'c', 3 ],
[ 4, 'c', 4 ], [ 4, 'd', 5 ], [ 4, 'b', 6 ],
[ 5, 'b', 6 ],
) {
$NFA->new_edge( $nodes[$r->[0]], $edgeFuncs{$r->[1]}, $nodes[$r->[
+2]] );
}
# Build 10 string streams
my @strs = map {
{
ID=>$_, pos=>-1, string=>init_string(),
parser=>$NFA->parser(), match=>0
}
} 1 .. 10;
# Some known match strings for the demo
$strs[2]{string} = "XYZabbbbbcdEF";
$strs[3]{string} = "XYZacDEF";
$strs[4]{string} = "XYZcccccdbBBBBCDEF";
$strs[5]{string} = "XYZccccbDEF";
# OK, everything is set up!
#####
# Now process the string streams until all are complete
#####
my $passes = 0;
my $active_streams = @strs;
while ($active_streams) {
print "\n\n===== PASS: ", ++$passes, " (active=$active_streams)\n\
+n";
$active_streams = 0;
#++$dbg = 0;
last if $passes > 1000;
#print "\n\n===== PASS: $dbg\n\n";
# Give each string a little processing
for my $i (0 .. $#strs) {
next if exists $strs[$i]{KILL}; # Ignore completed streams
my $rS = $strs[$i];
my $l = length($rS->{string});
++$active_streams;
printf "S[%2u]: ID:$rS->{ID}, p:$rS->{pos}, l:$l, st:%s)\n",
$i, $rS->{parser}->curID();
# If we have data waiting, advance the state machine
if ($rS->{pos} < $l) {
++$rS->{pos};
my $char = substr($rS->{string}, $rS->{pos}, 1);
$rS->{parser}->accept($char);
# Display the string, current position and possible match
+chars
print " '$rS->{string}' ch='$char'->ST=",
$rS->{parser}->curID(), "\n";
if ($rS->{parser}->is_final()) {
# Match: arrows indicate match, last arrow is current
+position
print " " x (5 + ($rS->{pos} - $rS->{parser}{cnt} + 1)
+);
print "^" x $rS->{parser}{cnt}, "=match\n";
$rS->{parser}{cur} = $rS->{parser}{NFA}->start(); # pr
+ep for next
}
else {
# No match: just show current position
print " " x (5+$rS->{pos}), "^=pos\n";
}
}
elsif ($l >= BUFFER_FULL) {
# No data waiting *and* buffer is full, so we're done
print " '$rS->{string}' ** COMPLETED **\n";
$rS->{KILL}=1;
}
else {
print " '$rS->{string}' --waiting for data--\n";
}
# Do we have room in the buffer?
if ($l < BUFFER_FULL) {
# Yes, add data to the buffer (sometimes)
if (0.1 > rand) {
my $extend = init_string();
$rS->{string} .= $extend;
print " '$rS->{string}'\n";
print " " x (5 + $l), "^" x (length $extend), "=new da
+ta\n";
}
}
}
$active_streams = scalar grep { ! exists $_->{KILL} } @strs;
}
# Return a random character from 'a' to 'f'
sub a_char {
state @alphabet = ('a', 'b', 'c', 'd', 'e', 'f');
return $alphabet[@alphabet*rand];
}
# Return a string of 1..9 characters
sub init_string {
return join("", map { a_char() } 1 .. 3+6*rand);
}
# Return a function that recognizes the specified character
sub gen_edgeFn_char {
my $char = shift;
return sub {
my $nextChar = shift;
return $char eq $nextChar;
}
}
When it runs, it generates over 4K lines of output with a bunch of detail about what it's doing. I've selected pass 75 to include here because it shows all possible actions a stream may perform in the demo:
- Stream 0 both advances a character and receives new data.
- Stream 1 is stalled, waiting for new data to arrive.
- Stream 2 just advances a character.
- Stream 4 has finished processing and it's "full", so it terminates.
===== PASS: 75 (active=10)
S[ 0]: ID:1, p:50, l:63, st:*)
'bebdfbbcbcbbccdecbbcbcfcdabccecadadcaaeadcafefccbceceadefaebabf'
+ch='c'->ST=4
^=pos
'bebdfbbcbcbbccdecbbcbcfcdabccecadadcaaeadcafefccbceceadefaebabfbd
+ecfec'
^^
+^^^^^=new data
S[ 1]: ID:2, p:42, l:42, st:*)
'bffceddcaeecccfbabbeccbaeaddaeaffabeafadfd' --waiting for data--
S[ 2]: ID:3, p:53, l:60, st:*)
'XYZabbbbbcdEFaceaedebcbbdaeaeedebccdfefaeafadcbfffdbaeecdeca' ch=
+'e'->ST=*
^=pos
S[ 3]: ID:4, p:40, l:43, st:4)
'XYZacDEFebceeabbccbaabaadaedfcddfabdcdebcbd' ch='b'->ST=6
^^=match
S[ 4]: ID:5, p:73, l:73, st:*)
'XYZcccccdbBBBBCDEFfbdcfdafaceeafdccaefffcdccbebfebeecfeebdadbecdb
+aadcaafa' ** COMPLETED **
S[ 5]: ID:6, p:34, l:35, st:*)
'XYZccccbDEFaceafabddbdfebededbafadb' ch=''->ST=*
^=pos
S[ 6]: ID:7, p:40, l:40, st:*)
'fdafbfedcaafecaeabfecffbbdbbadccefcddacb' --waiting for data--
S[ 7]: ID:8, p:62, l:67, st:*)
'faaccfeabcabeddecaccdffebeaefffaacacafcddfbdadbdfdeecbdbeebdffbbb
+be' ch='b'->ST=*
^=
+pos
S[ 8]: ID:9, p:44, l:47, st:*)
'aeefaceaccafeffecedaafebbeeedadcfecbccaacdfcdba' ch='b'->ST=*
^=pos
S[ 9]: ID:10, p:39, l:39, st:*)
'fcefebdfedadcffcfccfdebbbffcaafcbdfaffc' --waiting for data--