use strict; package WordTrie { # "new" creates an empty WordTrie node, which is also a proper # WordTrie object when handled as as the trie's root node. sub new { bless {children => {}, terminates => 0}, shift } # insert a word into the trie sub insert { my $self = shift; for my $word (@_) { $word =~ s/^\s+|\s+$//g; # trim whitespace length($word) or next; my $node = $self; for my $ch (split //, lc $word) { $node = ($node->{children}->{$ch} //= new ref($self)); } $node->{terminates} = 1; } } # insert a list of words loaded from a file by name or GLOB reference # the optional first argument is a whitelisting regex for filtering # the file's words for insertion. sub load { my $self = shift; my $regex = ref($_[0]) eq 'Regexp' ? shift() : undef; for my $arg (@_) { my $fh = ref($arg) ? $arg : undef; $fh or open($fh, $arg) or die "failed to read $arg"; my @list = <$fh>; chomp(@list); defined($regex) and @list = grep { /$regex/ } @list; $self->insert(@list); } } # simple traversal for dumping all of the trie's words sub full_traverse { my $self = shift; my @stack = ([$self]); while(@stack) { my $context = $stack[-1]; my ($node, $word, $child_keys) = @$context; unless($child_keys) { $node->{terminates} and print "$word\n"; $child_keys = $context->[2] = [sort keys %{$node->{children}}]; } if(@$child_keys) { my $ch = shift @$child_keys; push @stack, [$node->{children}->{$ch}, $word . $ch]; } else { pop @stack; } } } # A primary traversal, with secondary traversals for finding pairs of same-length words, # different at some number of character positions, within a given range of counts. # Accepts zero, one or two integer arguments specifying the extremes of an # inclusive range of difference counts. # If only one number is provided, the range includes only one difference count. # If no arguments are given, defaults to a difference count of 1 sub traverse_deltas { my $self = shift; my ($min_diff_count, $max_diff_count) = @_; $min_diff_count //= 1; $max_diff_count //= $min_diff_count; $max_diff_count < $min_diff_count and ($min_diff_count, $max_diff_count) = ($max_diff_count, $min_diff_count); # A secondary traversal does not intersect its primary traversal, so "zero difference" pairs # cannot be found. Throw an error if pairs with zero (or negative!) differences are requested. $min_diff_count <= 0 and die "0 is inside traverse_deltas diffcount range"; # stack[0] saves the states at each node level of a depth-first recursive tree traversal # # stack[1] when the primary traversal finds a word, stack[1] gets a copy of stack[0] # and performs a secondary continuation of the primary traversal, specifically # for finding words whose lengths match the primary word, and with character # difference counts between $min_diff_count and $max_diff_count (inclusive). # Start with one stack containing one element, a new context for the trie's root node. # Each node is a WordTrie object, for simplicity and because no collection metadata # was needed in this demo for a container object to maintain. my @stacks = ([{node => $self}]); while(@stacks) { my $ctx = $stacks[-1][-1]; # get the active context, at the top position of the active stack unless($ctx->{child_keys}) { # if "child_keys" is not defined, this context was just created. # Traversal may return to this node more than once, and an undefined "child_keys" # acts as a flag for triggering one-time actions for this node, # like something that needs to be done once each time a complete word is found. # create "child_keys", an array of single-character indexes into this node's # "children" hash. Each child node represents the appending of its key character # to this context's "word". $ctx->{child_keys} = [sort keys %{$ctx->{node}->{children}}]; # Check if a word ends here. Even if this node terminates a word, it may still # have children. This happens when one word prefixes another longer word. # For example, 'an' could terminate at this node, but if 'and' exists, # it would terminate with this node's child whose key is 'd' if($ctx->{node}->{terminates}) { if(@stacks == 1) { # The primary traversal found a complete word. Now create a secondary # traversal to search for matching words. The secondary traversal continues # from the primary context. Work would be wasted if it started at the beginning, # because each matching word pair would be duplicated. # This copies the primary context stack into a secondary context stack. # The procedure is complicated by the need to make a deep-copy of the "child_keys" arrays, # in order to preserve the primary versions. for my $ctx (@{$stacks[0]}) { # This uses a key/value hash slice, a newish feature push @{$stacks[1]}, { %{$ctx}{qw/node word diff_count/}, child_keys => [@{$ctx->{child_keys}}] }; } } elsif(@{$stacks[-1]} == @{$stacks[-2]}) { # The secondary traversal hit a word whose length is the same as the primary. # Check that the difference count is within the desired range; if so, print the pair. # Pairs are always ordered alphabetically (or however "sort" orders keys), with # the primary word preceding the secondary. if($ctx->{diff_count} >= $min_diff_count && $ctx->{diff_count} <= $max_diff_count) { my ($pri, $sec, $diffs) = ($stacks[-2][-1]->{word}, $ctx->{word}, $ctx->{diff_count}); # This is where one would add custom code for handling each matching pair. # The demo just prints the primary word, the secondary word and their difference count. print " $pri $sec ($diffs)\n"; } } } # If a secondary traversal is being performed, do not traverse deeper than the primary # traversal, because deeper words are all longer than the primary word. @stacks > 1 && @{$stacks[0]} == @{$stacks[1]} and $ctx = undef; } if(!$ctx || !@{$ctx->{child_keys}}) { # if context was cleared (to cancel deeper traversal), # or no children remain to be searched in this context, # remove this context from the top of the active (top) stack # and if the active stack is now empty, remove it from # the stack of context stacks, @stacks. # Primary traversal takes place when @stacks has only one element, # and the main loop ends when @stacks is empty. pop @{$stacks[-1]}; @{$stacks[-1]} or pop @stacks; } else { # Children remain to be searched: get the next child's key character my $ch = shift @{$ctx->{child_keys}}; my $diff_count = 0+$ctx->{diff_count}; if(@stacks > 1) { # For a secondary traversal, check whether the child key # just selected is the same as the primary word's character # at the next depth. If they are different, increment # "diff_count" between this context and the new child my $primary_word = $stacks[-2][@{$stacks[-1]}]->{word}; my ($primary_ch) = $primary_word =~ /(.)$/; $ch ne $primary_ch and ++$diff_count; # If difference count corresponding to this new child is greater # than the allowed maximum, cancel traversal into the child. $diff_count > $max_diff_count and next; } # push a new child context # "node" is this node's child, from its "children" hash, indexed by the next key, stored in $ch # "word" is this context's word, appended with the child's key character # "diff_count" is this node's "diff_count", incremeneted only during a secondary traversal, # when the key character is different than the primary traversal's key character # at the same depth as the child. push @{$stacks[-1]}, { node => $ctx->{node}->{children}->{$ch}, word => $ctx->{word} . $ch, diff_count => $diff_count }; } } } } # Create an empty WordTrie object my $trie = WordTrie->new; # Example which loads words from the __DATA__ section at the bottom of this file. $trie->load(\*DATA); ## Example which loads words from a dictionary file named 'english-words.95' ## This file has words which contain symbols like apostrophes, so ## a whitelisting regex is supplied for keeping just all-alpha words. ## Although the trie stores words in a case-insensitive manner, the loading regex ## is applied to words as they appear in the file, so it must take this into account. # $trie->load(qr/^[a-z]+$/i, 'english-words.95'); ## This prints all words contained in the trie # $trie->full_traverse; # This finds pairs which differ by only one charcter $trie->traverse_deltas(1); ## This finds pairs which differ by between 1 and 3 characters. # $trie->traverse_deltas(1, 3); __DATA__ aardvark aardbark bbardvark bardbark bardvark barddork aaardvark