The final program is this bit:
#!/usr/bin/perl
#
# a clustering algorithm
#
use strict;
use warnings;
use Data::Dumper;
$|=1;
my $begin = time;
my $items;
my %cache;
my $arr_cnt;
srand(0);
build_items(1700, 5, 100, 10000);
#build_items(500, 5, 50, 2500);
# Build cache
my $difference = 9999; #Arbitrary large number
for my $i ( 0 .. $#$items-1 ) {
my $d1 = $items->[$i];
for my $j ( $i+1 .. $#$items ) {
my $d2 = $items->[$j];
my $diff = max_diff( $d1, $d2 );
}
}
my $cur = time - $begin;
print "cache built, $cur s\n";
# Build stitch list: i.e. list of things to tie together, by ordering
+the cache by distance
my @stitch;
for my $k1 (keys %cache) {
for my $k2 (keys %{$cache{$k1}}) {
push @stitch, [ $k1, $k2, $cache{$k1}{$k2} ];
}
}
@stitch = sort { $a->[2]<=>$b->[2] || $a->[0] cmp $b->[0] || $a->[1] c
+mp $b->[1] } @stitch;
$cur = time - $begin;
print "stitch list built, $cur s\n";
# build the clusters
my %clusters;
my %sretsulc;
my $clust_cnt=0;
for my $idx (0 .. $#stitch) {
my ($k1, $k2, $d) = @{$stitch[$idx]};
my $fl='N';
my ($cl1, $cl2, $msg);
$cl1 = $sretsulc{$k1} if exists $sretsulc{$k1};
$cl2 = $sretsulc{$k1} if exists $sretsulc{$k2};
if (!defined $cl1) {
# Remove one special case, leaving: CL+CL, CL+K, K+K
($k1, $k2, $cl1, $cl2) = ($k2, $k1, $cl2, $cl1);
$fl='Y';
}
$msg = "$idx: Distance $d ($k1 <-$fl-> $k2) ";
if (defined $cl1 and defined $cl2) {
if ($cl1 eq $cl2) {
#print "\t$k1 and $k2 are in same cluster ($cl1)\n";
next;
}
$msg .= "\tJoining $cl1 ($k1) and $cl2 ($k2)";
++$clust_cnt;
my $cl3 = "clust $clust_cnt";
$clusters{$cl3}{L} = $cl1;
$clusters{$cl3}{R} = $cl2;
$clusters{$cl1}{P} = $cl3;
$clusters{$cl2}{P} = $cl3;
my $size=0;
for my $k (keys %sretsulc) {
my $cl = $sretsulc{$k};
if ($cl eq $cl1 or $cl eq $cl2) {
$sretsulc{$k} = $cl3;
++$size;
}
}
$msg .= " new cluster: $size items";
}
elsif (defined $cl1) {
# build new cluster of cl1 and k2
++$clust_cnt;
my $cl3 = "clust $clust_cnt";
$msg .= "\tjoining $cl1 and $k2 into $cl3";
$clusters{$cl3}{L} = $cl1;
$clusters{$cl3}{R} = $k2;
$clusters{$cl1}{P} = $cl3;
my $size=0;
for my $k (keys %sretsulc) {
my $cl = $sretsulc{$k};
if ($cl eq $cl1) {
$sretsulc{$k} = $cl3;
++$size;
}
}
$sretsulc{$k2}=$cl3;
++$size;
$msg .= " new cluster: $size items";
}
else {
# Two unclustered items
++$clust_cnt;
my $cl3 = "clust $clust_cnt";
$msg .= "\tjoining $k1 and $k2 into $cl3";
$clusters{$cl3}{L} = $k1;
$clusters{$cl3}{R} = $k2;
$sretsulc{$k1}=$cl3;
$sretsulc{$k2}=$cl3;
}
print $msg, "\n";
}
$cur = time - $begin;
print "clusters built, $cur s\n";
sub merge {
my( $x, $y ) = @_;
# Both non-clusters
if( ref $x eq 'HASH' and ref $y eq 'HASH' ) {
++$arr_cnt;
print "\tmerging hashes $x->{name} and $y->{name} into <arr $a
+rr_cnt>\n";
return [$x,$y, "arr $arr_cnt" ];
}
# $x cluster
elsif( ref $x eq 'ARRAY' and ref $y eq 'HASH' ) {
++$arr_cnt;
print "\tmerging $x->[2] and $y->{name} into <arr $arr_cnt>\n"
+;
return [$x,$y, "arr $arr_cnt" ];
}
# $y cluster
elsif( ref $x eq 'HASH' and ref $y eq 'ARRAY' ) {
++$arr_cnt;
print "\tmerging $x->{name} and $y->[2] into <arr $arr_cnt>\n"
+;
return [$y,$x, "arr $arr_cnt" ];
}
elsif( ref $x eq 'ARRAY' and ref $y eq 'ARRAY' ) {
++$arr_cnt;
print "\tmerging $x->[2] and $y->[2] into <arr $arr_cnt>\n";
return [$x,$y, "arr $arr_cnt" ];
}
else {
die "Wtf? $x $y";
}
}
sub max_diff {
my( $d1, $d2 ) = @_;
if( ref $d1 eq 'HASH' and ref $d2 eq 'HASH' ) {
my ($name1,$name2) = ($d1->{name}, $d2->{name});
($name1,$name2) = ($name1 lt $name2) ? ($name1, $name2) : ($na
+me2, $name1);
if (exists $cache{$name1}{$name2}) {
return $cache{$name1}{$name2};
}
my $t=0;
for (keys %{$d1->{words}}) { ++$t if ! exists $d2->{words}{$_}
+ }
for (keys %{$d2->{words}}) { ++$t if ! exists $d1->{words}{$_}
+ }
$cache{$name1}{$name2} = $t;
return $t;
}
elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'HASH' ) {
my $x = max_diff( $d1->[0], $d2 );
my $y = max_diff( $d1->[1], $d2 );
return $x > $y ? $x : $y;
}
elsif( ref $d1 eq 'HASH' and ref $d2 eq 'ARRAY' ) {
my $x = max_diff( $d2->[0], $d1 );
my $y = max_diff( $d2->[1], $d1 );
return $x > $y ? $x : $y;
}
elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'ARRAY' ) {
my $x = max_diff( $d1->[0], $d2->[0] );
my $y = max_diff( $d1->[1], $d2->[1] );
my $xx = max_diff( $d1->[0], $d2->[1] );
my $yy = max_diff( $d1->[1], $d2->[0] );
return max( $x, $y, $xx, $yy );
}
else {
die "Wtffffff $d1 $d2";
}
}
sub max {
my ($ret, @t) = @_;
for (@t) {
$ret = $_ if $_ > $ret;
}
return $ret;
}
sub build_items {
# Build an array of $num_items, where each item is $it_min - $it_m
+ax distinct (non-dup) words
my ($num_items, $it_min, $it_max, $num_words) = @_;
$num_words = $it_min * $num_items if ! defined $num_words;
# Read the dictionary
my @words;
{
my %words;
open my $FH, '<', '/etc/dictionaries-common/words';
while (<$FH>) {
s/\s+$//;
next if /'s$/;
$words{$_}=0;
}
@words = keys %words;
}
print "Dictionary had ", scalar(@words), " words\n";
# Make a list of words
{
my %dict;
while (keys %dict < $num_words) {
my $idx = int(rand()*@words);
$dict{$words[$idx]}=0;
$words[$idx] = pop @words;
}
@words = keys %dict;
print "Trimmed dictionary had ", scalar(@words), " words\n";
}
for my $it_idx (0 .. $num_items-1) {
my %item_words;
my $it_cnt = $it_min + int(rand()*($it_max-$it_min));
for (1 .. $it_cnt) {
$item_words{$words[int rand()*@words]}=0;
}
$$items[$it_idx] = {
words => { map { $_=>0 } keys %item_words },
name => "item $it_idx",
};
}
}
If you use your original code between the "build clusters" through "clusters built" with code below, then you should have the version with caching (more or less).