Re: Optimizing a naive clustering algorithm

BUU:

I've played around with your code a bit for fun. In one version, I used RichardK's suggestion, and gave a bit of a speedup. (I never let it run to completion, and each iteration gets slower and slower....). I then added caching so you don't need to compare the same pair of hashes more than once. That gave a significant speedup.

I then tried oiskuu's suggestion of building a half triangular distance matrix, and made a slightly different distance check, and it was able to cluster 1700 items of 5-100 words in about 25 minutes. (I didn't pay much attention to the word lengths, I just randomly selected a population of words from the dictionary.) I'm kind of curious about what results other monks'll get.

The final program is this bit:

#!/usr/bin/perl
#
#   a clustering algorithm
#
use strict;
use warnings;
use Data::Dumper;

$|=1;

my $begin = time;
my $items;
my %cache;
my $arr_cnt;
srand(0);
build_items(1700, 5, 100, 10000);
#build_items(500, 5, 50, 2500);

# Build cache
my $difference = 9999; #Arbitrary large number
for my $i ( 0 .. $#$items-1 ) {
    my $d1 = $items->[$i];
    for my $j ( $i+1 .. $#$items ) {
        my $d2 = $items->[$j];
        my $diff = max_diff( $d1, $d2 );
    }
}
my $cur = time - $begin;
print "cache built, $cur s\n";

# Build stitch list: i.e. list of things to tie together, by ordering 
+the cache by distance
my @stitch;
for my $k1 (keys %cache) {
    for my $k2 (keys %{$cache{$k1}}) {
        push @stitch, [ $k1, $k2, $cache{$k1}{$k2} ];
    }
}
@stitch = sort { $a->[2]<=>$b->[2] || $a->[0] cmp $b->[0] || $a->[1] c
+mp $b->[1] } @stitch;
$cur = time - $begin;
print "stitch list built, $cur s\n";

# build the clusters
my %clusters;
my %sretsulc;
my $clust_cnt=0;
for my $idx (0 .. $#stitch) {
    my ($k1, $k2, $d) = @{$stitch[$idx]};
    my $fl='N';
    my ($cl1, $cl2, $msg);
    $cl1 = $sretsulc{$k1} if exists $sretsulc{$k1};
    $cl2 = $sretsulc{$k1} if exists $sretsulc{$k2};


    if (!defined $cl1) {
        # Remove one special case, leaving:  CL+CL, CL+K, K+K
        ($k1, $k2, $cl1, $cl2) = ($k2, $k1, $cl2, $cl1);
        $fl='Y';
    }
    $msg = "$idx: Distance $d ($k1 <-$fl-> $k2) ";

    if (defined $cl1 and defined $cl2) {
        if ($cl1 eq $cl2) {
            #print "\t$k1 and $k2 are in same cluster ($cl1)\n";
            next;
        }
        $msg .= "\tJoining $cl1 ($k1) and $cl2 ($k2)";
        ++$clust_cnt;
        my $cl3 = "clust $clust_cnt";
        $clusters{$cl3}{L} = $cl1;
        $clusters{$cl3}{R} = $cl2;
        $clusters{$cl1}{P} = $cl3;
        $clusters{$cl2}{P} = $cl3;
        my $size=0;
        for my $k (keys %sretsulc) {
            my $cl = $sretsulc{$k};
            if ($cl eq $cl1 or $cl eq $cl2) {
                $sretsulc{$k} = $cl3;
                ++$size;
            }
        }
        $msg .= "  new cluster: $size items";
    }
    elsif (defined $cl1) {
        # build new cluster of cl1 and k2
        ++$clust_cnt;
        my $cl3 = "clust $clust_cnt";
        $msg .= "\tjoining $cl1 and $k2 into $cl3";
        $clusters{$cl3}{L} = $cl1;
        $clusters{$cl3}{R} = $k2;
        $clusters{$cl1}{P} = $cl3;
        my $size=0;
        for my $k (keys %sretsulc) {
            my $cl = $sretsulc{$k};
            if ($cl eq $cl1) {
                $sretsulc{$k} = $cl3;
                ++$size;
            }
        }
        $sretsulc{$k2}=$cl3;
        ++$size;
        $msg .= " new cluster: $size items";
    }
    else {
        # Two unclustered items
        ++$clust_cnt;
        my $cl3 = "clust $clust_cnt";
        $msg .= "\tjoining $k1 and $k2 into $cl3";
        $clusters{$cl3}{L} = $k1;
        $clusters{$cl3}{R} = $k2;
        $sretsulc{$k1}=$cl3;
        $sretsulc{$k2}=$cl3;
    }

    print $msg, "\n";
    
}
$cur = time - $begin;
print "clusters built, $cur s\n";




sub merge {
    my( $x, $y ) = @_;

    # Both non-clusters
    if( ref $x eq 'HASH' and ref $y eq 'HASH' ) {
        ++$arr_cnt;
        print "\tmerging hashes $x->{name} and $y->{name} into <arr $a
+rr_cnt>\n";
        return [$x,$y, "arr $arr_cnt" ];
    }
    # $x cluster
    elsif( ref $x eq 'ARRAY' and ref $y eq 'HASH' ) {
        ++$arr_cnt;
        print "\tmerging $x->[2] and $y->{name} into <arr $arr_cnt>\n"
+;
        return [$x,$y, "arr $arr_cnt" ];
    }
    # $y cluster
    elsif( ref $x eq 'HASH' and ref $y eq 'ARRAY' ) {
        ++$arr_cnt;
        print "\tmerging $x->{name} and $y->[2] into <arr $arr_cnt>\n"
+;
        return [$y,$x, "arr $arr_cnt" ];
    }
    elsif( ref $x eq 'ARRAY' and ref $y eq 'ARRAY' ) {
        ++$arr_cnt;
        print "\tmerging $x->[2] and $y->[2] into <arr $arr_cnt>\n";
        return [$x,$y, "arr $arr_cnt" ];
    }
    else {
        die "Wtf? $x $y";
    }
}

sub max_diff {
    my( $d1, $d2 ) = @_;
    if( ref $d1 eq 'HASH' and ref $d2 eq 'HASH' ) {
        my ($name1,$name2) = ($d1->{name}, $d2->{name});
        ($name1,$name2) = ($name1 lt $name2) ? ($name1, $name2) : ($na
+me2, $name1);
        if (exists $cache{$name1}{$name2}) {
            return $cache{$name1}{$name2};
        }
        my $t=0;
        for (keys %{$d1->{words}}) { ++$t if ! exists $d2->{words}{$_}
+ }
        for (keys %{$d2->{words}}) { ++$t if ! exists $d1->{words}{$_}
+ }
        $cache{$name1}{$name2} = $t;
        return $t;
    }
    elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'HASH' ) {
        my $x = max_diff( $d1->[0], $d2 );
        my $y = max_diff( $d1->[1], $d2 );

        return $x > $y ? $x : $y;
    }
    elsif( ref $d1 eq 'HASH' and ref $d2 eq 'ARRAY' ) {
        my $x = max_diff( $d2->[0], $d1 );
        my $y = max_diff( $d2->[1], $d1 );

        return $x > $y ? $x : $y;
    }
    elsif( ref $d1 eq 'ARRAY' and ref $d2 eq 'ARRAY' ) {
        my $x = max_diff( $d1->[0], $d2->[0] );
        my $y = max_diff( $d1->[1], $d2->[1] );
        my $xx = max_diff( $d1->[0], $d2->[1] );
        my $yy = max_diff( $d1->[1], $d2->[0] );

        return max( $x, $y, $xx, $yy );
    }
    else {
        die "Wtffffff $d1 $d2";
    }
}

sub max {
    my ($ret, @t) = @_;
    for (@t) {
        $ret = $_ if $_ > $ret;
    }
    return $ret;
}

sub build_items {
    # Build an array of $num_items, where each item is $it_min - $it_m
+ax distinct (non-dup) words
    my ($num_items, $it_min, $it_max, $num_words) = @_;

    $num_words = $it_min * $num_items if ! defined $num_words;

    # Read the dictionary
    my @words;
    {
        my %words;
        open my $FH, '<', '/etc/dictionaries-common/words';
        while (<$FH>) {
            s/\s+$//;
            next if /'s$/;
            $words{$_}=0;
        }
        @words = keys %words;
    }
    print "Dictionary had ", scalar(@words), " words\n";

    # Make a list of words
    {
        my %dict;
        while (keys %dict < $num_words) {
            my $idx = int(rand()*@words);
            $dict{$words[$idx]}=0;
            $words[$idx] = pop @words;
        }
        @words = keys %dict;
        print "Trimmed dictionary had ", scalar(@words), " words\n";
    }

    for my $it_idx (0 .. $num_items-1) {
        my %item_words;
        my $it_cnt = $it_min + int(rand()*($it_max-$it_min));
        for (1 .. $it_cnt) {
            $item_words{$words[int rand()*@words]}=0;
        }
        $$items[$it_idx] = {
            words => { map { $_=>0 } keys %item_words },
            name  => "item $it_idx",
        };
    }
}
[download]

If you use your original code between the "build clusters" through "clusters built" with code below, then you should have the version with caching (more or less).

...roboticus

When your only tool is a hammer, all problems look like your thumb.

Comment on Re: Optimizing a naive clustering algorithm Download Code


Clear questions and runnable code get the best and fastest answer
	PerlMonks