comment on

BUU:

I've played around with your code a bit for fun. In one version, I used RichardK's suggestion, and gave a bit of a speedup. (I never let it run to completion, and each iteration gets slower and slower....). I then added caching so you don't need to compare the same pair of hashes more than once. That gave a significant speedup.

I then tried oiskuu's suggestion of building a half triangular distance matrix, and made a slightly different distance check, and it was able to cluster 1700 items of 5-100 words in about 25 minutes. (I didn't pay much attention to the word lengths, I just randomly selected a population of words from the dictionary.) I'm kind of curious about what results other monks'll get.

The final program is this bit:

#!/usr/bin/perl
#
#   a clustering algorithm
#
use strict;
use warnings;
use Data::Dumper;

$|=1;

my $begin = time;
my $items;
my %cache;
my $arr_cnt;
srand(0);
build_items(1700, 5, 100, 10000);
#build_items(500, 5, 50, 2500);

# Build cache
my $difference = 9999; #Arbitrary large number
for my $i ( 0 .. $#$items-1 ) {
    my $d1 = $items->[$i];
    for my $j ( $i+1 .. $#$items ) {
        my $d2 = $items->[$j];
        my $diff = max_diff( $d1, $d2 );
    }
}
my $cur = time - $begin;
print "cache built, $cur s\n";

# Build stitch list: i.e. list of things to tie together, by ordering 
+the cache by distance
my @stitch;
for my $k1 (keys %cache) {
    for my $k2 (keys %{$cache{$k1}}) {
        push @stitch, [ $k1, $k2, $cache{$k1}{$k2} ];
    }
}
@stitch = sort { $a->[2]<=>$b->[2] || $a->[0] cmp $b->[0] || $a->[1] c
+mp $b->[1] } @stitch;
$cur = time - $begin;
print "stitch list built, $cur s\n";

# build the clusters
my %clusters;
my %sretsulc;
my $clust_cnt=0;
for my $idx (0 .. $#stitch) {
    my ($k1, $k2, $d) = @{$stitch[$idx]};
    my $fl='N';
    my ($cl1, $cl2, $msg);
    $cl1 = $sretsulc{$k1} if exists $sretsulc{$k1};
    $cl2 = $sretsulc{$k1} if exists $sretsulc{$k2};


    if (!defined $cl1) {
        # Remove one special case, leaving:  CL+CL, CL+K, K+K
        ($k1, $k2, $cl1, $cl2) = ($k2, $k1, $cl2, $cl1);
        $fl='Y';
    }
    $msg = "$idx: Distance $d ($k1 <-$fl-> $k2) ";

    if (defined $cl1 and defined $cl2) {
        if ($cl1 eq $cl2) {
            #print "\t$k1 and $k2 are in same cluster ($cl1)\n";
            next;
        }
        $msg .= "\tJoining $cl1 ($k1) and $cl2 ($k2)";
        ++$clust_cnt;
        my $cl3 = "clust $clust_cnt";
        $clusters{$cl3}{L} = $cl1;
        $clusters{$cl3}{R} = $cl2;
        $clusters{$cl1}{P} = $cl3;
        $clusters{$cl2}{P} = $cl3;
        my $size=0;
        for my $k (keys %sretsulc) {
            my $cl = $sretsulc{$k};
            if ($cl eq $cl1 or $cl eq $cl2) {
                $sretsulc{$k} = $cl3;
                ++$size;
            }
        }
        $msg .= "  new cluster: $size items";
    }
    elsif (defined $cl1) {
        # build new cluster of cl1 and k2
        ++$clust_cnt;
        my $cl3 = "clust $clust_cnt";
        $msg .= "\tjoining $cl1 and $k2 into $cl3";
        $clusters{$cl3}{L} = $cl1;
        $clusters{$cl3}{R} = $k2;
        $clusters{$cl1}{P} = $cl3;
        my $size=0;
        for my $k (keys %sretsulc) {
            my $cl = $sretsulc{$k};
            if ($cl eq $cl1) {
                $sretsulc{$k} = $cl3;
                ++$size;
            }
        }
        $sretsulc{$k2}=$cl3;
        ++$size;
        $msg .= " new cluster: $size items";
    }
    else {
        # Two unclustered items
        ++$clust_cnt;
        my $cl3 = "clust $clust_cnt";
        $msg .= "\tjoining $k1 and $k2 into $cl3";
        $clusters{$cl3}{L} = $k1;
        $clusters{$cl3}{R} = $k2;
        $sretsulc{$k1}=$cl3;
        $sretsulc{$k2}=$cl3;
    }

    print $msg, "\n";
    
}
$cur = time - $begin;
print "clusters built, $cur s\n";




sub merge {
    my( $x, $y ) = @_;

    # Both non-clusters
    if( ref $x eq 'HASH' and ref $y eq 'HASH' ) {
        ++$arr_cnt;
        print "\tmerging hashes $x->{name} and $y->{name} into <arr $a
+rr_cnt>\n";
        return [$x,$y, "arr $arr_cnt" ];
    }
    # $x cluster
    elsif( ref $x eq 'ARRAY' and ref $y eq 'HASH' ) {
        ++$arr_cnt;
        print "\tmerging $x->[2] and $y->{name} into <arr $arr_cnt>\n"
+;
        return [$x,$y, "arr $arr_cnt" ];
    }
    # $y cluster
    elsif( ref $x eq 'HASH' and ref $y eq 'ARRAY' ) {
        ++$arr_cnt;
        print "\tmerging $x->{name} and $y->[2] into <arr $arr_cnt>\n"
+;
        return [$y,$x, "arr $arr_cnt" ];
    }
    elsif( ref $x eq 'ARRAY' and ref $y eq 'ARRAY' ) {
        ++$arr_cnt;
        print "\tmerging $x->[2] and $y->[2] into <arr $arr_cnt>\n";
        return [$x,$y, "arr $arr_cnt" ];
    }
    else {
        die "Wtf? $x $y";
    }
}

sub max_diff {
    my( $d1, $d2 ) = @_;
    if( ref $d1 eq 'HASH' and ref $d2 eq 'HASH' ) {
        my ($name1,$name2) = ($d1->{name}, $d2->{name});
        ($name1,$name2) = ($name1 lt $name2) ? ($name1, $name2) : ($na
+me2, $name1);
        if (exists $cache{$name1}{$name2}) {
            return $cache{$name1}{$name2};
        }
        my $t=0;
        for (keys %{$d1->{words}}) { ++$t if ! exists $d2->{words}{$_}
+ }
        for (keys %{$d2->{words}}) { ++$t if ! exists $d1->{words}{$_}
+ }
        $cache{$name1}{$name2} = $t;
        return $t;
    }
    elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'HASH' ) {
        my $x = max_diff( $d1->[0], $d2 );
        my $y = max_diff( $d1->[1], $d2 );

        return $x > $y ? $x : $y;
    }
    elsif( ref $d1 eq 'HASH' and ref $d2 eq 'ARRAY' ) {
        my $x = max_diff( $d2->[0], $d1 );
        my $y = max_diff( $d2->[1], $d1 );

        return $x > $y ? $x : $y;
    }
    elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'ARRAY' ) {
        my $x = max_diff( $d1->[0], $d2->[0] );
        my $y = max_diff( $d1->[1], $d2->[1] );
        my $xx = max_diff( $d1->[0], $d2->[1] );
        my $yy = max_diff( $d1->[1], $d2->[0] );

        return max( $x, $y, $xx, $yy );
    }
    else {
        die "Wtffffff $d1 $d2";
    }
}

sub max {
    my ($ret, @t) = @_;
    for (@t) {
        $ret = $_ if $_ > $ret;
    }
    return $ret;
}

sub build_items {
    # Build an array of $num_items, where each item is $it_min - $it_m
+ax distinct (non-dup) words
    my ($num_items, $it_min, $it_max, $num_words) = @_;

    $num_words = $it_min * $num_items if ! defined $num_words;

    # Read the dictionary
    my @words;
    {
        my %words;
        open my $FH, '<', '/etc/dictionaries-common/words';
        while (<$FH>) {
            s/\s+$//;
            next if /'s$/;
            $words{$_}=0;
        }
        @words = keys %words;
    }
    print "Dictionary had ", scalar(@words), " words\n";

    # Make a list of words
    {
        my %dict;
        while (keys %dict < $num_words) {
            my $idx = int(rand()*@words);
            $dict{$words[$idx]}=0;
            $words[$idx] = pop @words;
        }
        @words = keys %dict;
        print "Trimmed dictionary had ", scalar(@words), " words\n";
    }

    for my $it_idx (0 .. $num_items-1) {
        my %item_words;
        my $it_cnt = $it_min + int(rand()*($it_max-$it_min));
        for (1 .. $it_cnt) {
            $item_words{$words[int rand()*@words]}=0;
        }
        $$items[$it_idx] = {
            words => { map { $_=>0 } keys %item_words },
            name  => "item $it_idx",
        };
    }
}
[download]

If you use your original code between the "build clusters" through "clusters built" with code below, then you should have the version with caching (more or less).

...roboticus

When your only tool is a hammer, all problems look like your thumb.

In reply to Re: Optimizing a naive clustering algorithm by roboticus
in thread Optimizing a naive clustering algorithm by BUU

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


more useful options
	PerlMonks