Here's a trie solution which probably does everything you have asked about. Its performance is suboptimal in that it is a standard trie, where most "trie" applications would opt for one of the optimized equivalents. Also, this has a lot of confusing nested structure, as well as two traversal context stacks, so for clarity, hashes with named elements are used where arrays of fixed-index fields would be faster.
It still runs pretty fast with SCOWL's big English word list, though.
Also, the code's size is bloated by excessive commenting.
use strict;
package WordTrie {
# "new" creates an empty WordTrie node, which is also a proper
# WordTrie object when handled as as the trie's root node.
sub new { bless {children => {}, terminates => 0}, shift }
# insert a word into the trie
sub insert {
my $self = shift;
for my $word (@_) {
$word =~ s/^\s+|\s+$//g; # trim whitespace
length($word) or next;
my $node = $self;
for my $ch (split //, lc $word) {
$node = ($node->{children}->{$ch} //= new ref($self));
}
$node->{terminates} = 1;
} }
# insert a list of words loaded from a file by name or GLOB refere
+nce
# the optional first argument is a whitelisting regex for filterin
+g
# the file's words for insertion.
sub load {
my $self = shift;
my $regex = ref($_[0]) eq 'Regexp' ? shift() : undef;
for my $arg (@_) {
my $fh = ref($arg) ? $arg : undef;
$fh or open($fh, $arg) or die "failed to read $arg";
my @list = <$fh>;
chomp(@list);
defined($regex) and @list = grep { /$regex/ } @list;
$self->insert(@list);
} }
# simple traversal for dumping all of the trie's words
sub full_traverse {
my $self = shift;
my @stack = ([$self]);
while(@stack) {
my $context = $stack[-1];
my ($node, $word, $child_keys) = @$context;
unless($child_keys) {
$node->{terminates} and print "$word\n";
$child_keys = $context->[2] = [sort keys %{$node->{chi
+ldren}}];
}
if(@$child_keys) {
my $ch = shift @$child_keys;
push @stack, [$node->{children}->{$ch}, $word . $ch];
} else {
pop @stack;
} } }
# A primary traversal, with secondary traversals for finding pairs
+ of same-length words,
# different at some number of character positions, within a given
+range of counts.
# Accepts zero, one or two integer arguments specifying the extrem
+es of an
# inclusive range of difference counts.
# If only one number is provided, the range includes only one diff
+erence count.
# If no arguments are given, defaults to a difference count of 1
sub traverse_deltas {
my $self = shift;
my ($min_diff_count, $max_diff_count) = @_;
$min_diff_count //= 1;
$max_diff_count //= $min_diff_count;
$max_diff_count < $min_diff_count and ($min_diff_count, $max_d
+iff_count) = ($max_diff_count, $min_diff_count);
# A secondary traversal does not intersect its primary travers
+al, so "zero difference" pairs
# cannot be found. Throw an error if pairs with zero (or nega
+tive!) differences are requested.
$min_diff_count <= 0 and die "0 is inside traverse_deltas diff
+count range";
# stack[0] saves the states at each node level of a depth-firs
+t recursive tree traversal
#
# stack[1] when the primary traversal finds a word, stack[1] g
+ets a copy of stack[0]
# and performs a secondary continuation of the primar
+y traversal, specifically
# for finding words whose lengths match the primary w
+ord, and with character
# difference counts between $min_diff_count and $max_
+diff_count (inclusive).
# Start with one stack containing one element, a new context f
+or the trie's root node.
# Each node is a WordTrie object, for simplicity and because n
+o collection metadata
# was needed in this demo for a container object to maintain.
my @stacks = ([{node => $self}]);
while(@stacks) {
my $ctx = $stacks[-1][-1]; # get the active context, at th
+e top position of the active stack
unless($ctx->{child_keys}) {
# if "child_keys" is not defined, this context was jus
+t created.
# Traversal may return to this node more than once, an
+d an undefined "child_keys"
# acts as a flag for triggering one-time actions for t
+his node,
# like something that needs to be done once each time
+a complete word is found.
# create "child_keys", an array of single-character in
+dexes into this node's
# "children" hash. Each child node represents the app
+ending of its key character
# to this context's "word".
$ctx->{child_keys} = [sort keys %{$ctx->{node}->{child
+ren}}];
# Check if a word ends here. Even if this node termin
+ates a word, it may still
# have children. This happens when one word prefixes
+another longer word.
# For example, 'an' could terminate at this node, but
+if 'and' exists,
# it would terminate with this node's child whose key
+is 'd'
if($ctx->{node}->{terminates}) {
if(@stacks == 1) {
# The primary traversal found a complete word.
+ Now create a secondary
# traversal to search for matching words. The
+ secondary traversal continues
# from the primary context. Work would be was
+ted if it started at the beginning,
# because each matching word pair would be dup
+licated.
# This copies the primary context stack into a
+ secondary context stack.
# The procedure is complicated by the need to
+make a deep-copy of the "child_keys" arrays,
# in order to preserve the primary versions.
for my $ctx (@{$stacks[0]}) {
# This uses a key/value hash slice, a newi
+sh feature
push @{$stacks[1]}, { %{$ctx}{qw/node word
+ diff_count/}, child_keys => [@{$ctx->{child_keys}}] };
}
} elsif(@{$stacks[-1]} == @{$stacks[-2]}) {
# The secondary traversal hit a word whose len
+gth is the same as the primary.
# Check that the difference count is within th
+e desired range; if so, print the pair.
# Pairs are always ordered alphabetically (or
+however "sort" orders keys), with
# the primary word preceding the secondary.
if($ctx->{diff_count} >= $min_diff_count && $c
+tx->{diff_count} <= $max_diff_count) {
my ($pri, $sec, $diffs) = ($stacks[-2][-1]
+->{word}, $ctx->{word}, $ctx->{diff_count});
# This is where one would add custom code
+for handling each matching pair.
# The demo just prints the primary word, t
+he secondary word and their difference count.
print " $pri $sec ($diffs)\n";
} } }
# If a secondary traversal is being performed, do not
+traverse deeper than the primary
# traversal, because deeper words are all longer than
+the primary word.
@stacks > 1 && @{$stacks[0]} == @{$stacks[1]} and $ctx
+ = undef;
}
if(!$ctx || !@{$ctx->{child_keys}}) {
# if context was cleared (to cancel deeper traversal),
# or no children remain to be searched in this context
+,
# remove this context from the top of the active (top)
+ stack
# and if the active stack is now empty, remove it from
# the stack of context stacks, @stacks.
# Primary traversal takes place when @stacks has only
+one element,
# and the main loop ends when @stacks is empty.
pop @{$stacks[-1]};
@{$stacks[-1]} or pop @stacks;
} else {
# Children remain to be searched: get the next child's
+ key character
my $ch = shift @{$ctx->{child_keys}};
my $diff_count = 0+$ctx->{diff_count};
if(@stacks > 1) {
# For a secondary traversal, check whether the chi
+ld key
# just selected is the same as the primary word's
+character
# at the next depth. If they are different, incre
+ment
# "diff_count" between this context and the new ch
+ild
my $primary_word = $stacks[-2][@{$stacks[-1]}]->{w
+ord};
my ($primary_ch) = $primary_word =~ /(.)$/;
$ch ne $primary_ch and ++$diff_count;
# If difference count corresponding to this new ch
+ild is greater
# than the allowed maximum, cancel traversal into
+the child.
$diff_count > $max_diff_count and next;
}
# push a new child context
# "node" is this node's child, from its "children" has
+h, indexed by the next key, stored in $ch
# "word" is this context's word, appended with the chi
+ld's key character
# "diff_count" is this node's "diff_count", incremenet
+ed only during a secondary traversal,
# when the key character is different tha
+n the primary traversal's key character
# at the same depth as the child.
push @{$stacks[-1]}, { node => $ctx->{node}->{children
+}->{$ch}, word => $ctx->{word} . $ch, diff_count => $diff_count };
} } }
}
# Create an empty WordTrie object
my $trie = WordTrie->new;
# Example which loads words from the __DATA__ section at the bottom of
+ this file.
$trie->load(\*DATA);
## Example which loads words from a dictionary file named 'english-wor
+ds.95'
## This file has words which contain symbols like apostrophes, so
## a whitelisting regex is supplied for keeping just all-alpha words.
## Although the trie stores words in a case-insensitive manner, the lo
+ading regex
## is applied to words as they appear in the file, so it must take thi
+s into account.
# $trie->load(qr/^[a-z]+$/i, 'english-words.95');
## This prints all words contained in the trie
# $trie->full_traverse;
# This finds pairs which differ by only one charcter
$trie->traverse_deltas(1);
## This finds pairs which differ by between 1 and 3 characters.
# $trie->traverse_deltas(1, 3);
__DATA__
aardvark
aardbark
bbardvark
bardbark
bardvark
barddork
aaardvark