This module converts the string to be anagrammed into a 26-byte representation of itself, where each byte's ASCII value is the number of times a given letter appears in the string. A string like "preinitialization" becomes
\2\0\0\0\1\0\0\0\5\0\0\1\0\2\1\1\0\1\0\2\0\0\0\0\0\1.
Then, candidate substring anagrams are converted into the 26-byte representations too, and then into regexes (so \3 becomes [\0-\3] and so on). The main string is matched against the regex, and if it succeeds, the values in that substring are removed from the main string.
Here's code that uses the module:
#!/usr/bin/perl
use Anagram26;
use strict;
my $file = shift;
my $words = build_word_list($file, @ARGV);
add_more_words($words, qw(
a I
I'm you're he's she's we're they're
I'll you'll he'll she'll we'll they'll
I've you've we've they've
it's that's what's
isn't can't won't don't doesn't
));
print "Anagrams of '@ARGV':\n";
while (my $word = get_anagram($words)) {
print "-> $word\n";
}
# or wait until they're all computed
# and get them in a hash via:
# my $anagram_hash = anagrams($words);
And here's the module. It supports a forking interface (via the
get_anagram() function) and a wait-till-the-end interface (via the
anagrams() function).
package Anagram26;
use strict;
require Exporter;
@Anagram26::ISA = qw( Exporter );
@Anagram26::EXPORT = qw(
build_word_list
add_more_words
get_anagram
anagrams
);
my @VAL = ('\0', map "[\\0-\\$_]", 1 .. 255);
my $SPAWNED = 0;
my $CHILD;
sub build_word_list {
my $file = shift;
(my $chars = lc join "", @_) =~ tr/a-z//cd;
my $len = length $chars;
my $rx = qr/[^$chars]/i;
my @words = ($chars);
open WORDS, "< $file" or die "can't read $file: $!";
while (<WORDS>) {
chomp;
next if $len < length or /$rx/;
push @{ $words[length]{frequency(lc)} }, $_;
}
close WORDS;
for (keys %{ $words[-1] }) {
warn freq_to_str($_), " => (@{ $words[-1]{$_} })\n";
}
return \@words;
}
sub add_more_words {
my $words = shift;
for (@_) {
(my $canon = lc $_) =~ tr/a-z//cd;
push @{ $words->[length $canon]{frequency($canon)} }, $_;
}
}
sub get_anagram {
&spawn if $SPAWNED == 0;
my $ana = <IN>;
if (defined $ana) { chomp $ana }
else { close IN }
return $ana;
}
sub spawn {
pipe IN, OUT;
select((select(OUT), $|=1)[0]);
$SPAWNED = 1;
my $pid = fork;
if ($pid) {
$CHILD = $pid;
close OUT;
}
elsif (defined $pid) {
close IN;
&anagrams;
exit;
}
else {
die "fork failed: $!";
}
}
sub anagrams {
my $words = shift;
my $chars = $words->[0];
my %seen;
my $ana;
$ana = sub {
my ($str, $len, $current, $prune) = @_;
my $rx = freq_to_rx($str);
if ($len == 0) {
for (expand(@$current)) {
my $ana = join " ", sort split;
if ($seen{$ana}++ == 0 and $SPAWNED) { print OUT "$ana\n" }
return;
}
}
for (reverse(1 .. $prune)) {
my $l = $words->[$_];
for my $w (grep /$rx/, keys %$l) {
my $p = ($_, $len - $_)[$_ > $len/2];
push @$current, $l->{$w};
$ana->(remove($str, $w, $len), $current, $p);
pop @$current;
}
}
};
$ana->(frequency($chars), length($chars), [], @$words-1);
close OUT if $SPAWNED;
return \%seen;
}
sub frequency {
my $word = shift;
my $s = "\0" x 26;
++vec($s, ord($_) - ord('a'), 8) for split //, $word;
return $s;
}
sub remove {
my ($str, $rem, $len) = @_;
my $o = 0;
vec($str, $o++, 8) -= ord, $len -= ord for split //, $rem;
return ($str, $len);
}
sub freq_to_rx {
my $rx = join "", @VAL[map ord, split //, shift];
qr/^$rx$/;
}
sub freq_to_str {
my $c = 'a';
join "", map $c++ x ord, split //, shift;
}
sub expand {
return @{ +shift } if @_ == 1;
return map { my $f = $_; map "$f $_", expand(@_) } @{ +shift };
}
END {
kill TERM => $CHILD if defined $CHILD;
}
1;
In reply to Anagram26
by japhy
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.