#!/usr/bin/perl -w # k Means demo program # Adapted from a VB program by Kardi Teknomo # http://people.revoledu.com/kardi/tutorial/kMean/index.html # Ported to Perl/Tk by bart @ Perlmonks use strict; # ----------------------- Tk interface ------------------------- use Tk; my $mw = MainWindow->new( -height => 403, -width => 477, -title => "k Means Clustering, adapted from tutorial by Kardi Teknomo" ); my $button_reset = $mw->Button(-text => "Clear Data", -relief => "raised", -command => \&reset_click); $button_reset->place( -x => 176, -y => 24, -height => 25, -width => 65); { my $label = $mw->Label( -text => "Click data in the canvas below. The program will automatically cluster the data by color code." ); $label->place( -x => 0, -y => 0, -height => 16, -width => 473); $label = $mw->Label(-text => "Number of clusters"); $label->place( -x => 10, -y => 28, -height => 18, -width => 95); $label = $mw->Label(-text => "(X, Y)", -justify => 'right'); $label->place( -x => 280, -y => 24, -height => 13, -width => 40); } my $label_xy = $mw->Label( -text => "X, Y"); $label_xy->place( -x => 330, -y => 24, -height => 13, -width => 50); my $clusters_entry = $mw->Entry(-relief => "sunken", -disabledforeground => 'darkgray'); $clusters_entry->place( -x => 112, -y => 24, -height => 24, -width => 24); $clusters_entry->insert('end', '3'); my $canvas = $mw->Scrolled('Canvas', -scrollbars => '', -background => 'white'); $canvas->place( -x => 0, -y => 56, -height => 403-56, -width => 477); $canvas->CanvasBind( "", [ \&canvas_click, Ev('x'), Ev('y')]); $canvas->CanvasBind( "", [ \&canvas_mousemove, Ev('x'), Ev('y')]); $canvas->CanvasBind( "", [ \&canvas_mouseleave ]); #----------------------- Event Handlers ------------------------ my @color = qw(red yellow green cyan blue purple gray magenta pink chartreuse coral darkolivegreen); # If you want to be able to have more clusters, add more colours. my(%point, @cluster, $dataset); sub reset_click { $canvas->delete('all'); (@cluster, %point, $dataset) = (); $clusters_entry->configure(-state => 'normal'); } sub canvas_mousemove { my $canvas = shift; my($x, $y) = @_; $label_xy->configure(-text => "($x, $y)"); } sub canvas_mouseleave { my $canvas = shift; $label_xy->configure(-text => ""); } sub canvas_click { my $canvas = shift; my($x, $y) = @_; if(!$dataset) { my $clusters = $clusters_entry->get; if($clusters !~ /^\d+$/ or $clusters == 0 or $clusters > @color) { warn "Not a valid value for cluster count"; return; } $dataset = Data::Cluster::kMean->new(0+$clusters) or die "Failed to make object"; $clusters_entry->configure(-state => 'disabled'); } $dataset->add(my $point = [ $x, $y ]); # A point is an array ref with coordinates my %record = ( data => $point, cluster => -1 ); $record{id} = $canvas->createLine($x, $y, $x, $y, -fill => 'red', -width => 8, -capstyle => 'round', -tags => ['dot'], ); # Keep track of point properties using a stringified reference to the point coordinates array $point{$point} = \%record; foreach my $cluster ($dataset->clusters) { my $i = $cluster->index; my $r = $cluster[$i] ||= { obj => $cluster, id => $canvas->createText($x, $y, -anchor => 'c', -width => 150, tag => 'label', -text => 1+$i) }; # Move centroid label my($x, $y) = @{$cluster->centroid}; $canvas->coords($r->{id}, $x, $y); # Colour dots according to cluster foreach my $p ($cluster->points) { if($point{$p}{cluster} != $i) { $point{$p}{cluster} = $i; $canvas->itemconfigure($point{$p}{id}, -fill => $color[$i]); } } } $canvas->raise('label', 'dot'); } #------------------------ Main Program ------------------------- MainLoop; #------------------------ kMean module ------------------------- package Data::Cluster::kMean; use List::Util qw(sum); sub new { my $class = shift; my($max_clusters) = @_; return bless { max_clusters => $max_clusters, data => [], cluster => [], clusters => [] }, $class; } sub add { # add data point(s) (array references) -- by reference, so make sure they're not reused for something else my $self = shift; return unless @_; unless(ref $_[0] eq 'ARRAY') { @_ = [ @_ ]; } foreach my $p (@_) { push @{$self->{data}}, $p; push @{$self->{cluster}}, -1; # not in a cluster if(@{$self->{clusters}} < $self->{max_clusters}) { my $index = @{$self->{clusters}}; push @{$self->{clusters}}, Data::Cluster::kMean::Cluster->new($self, $index); $self->{cluster}[-1] = $index; } else { my $c; { my $j = 0; my $min_dist; for my $cluster (@{$self->{clusters}}) { my $dist = _dist($p, $cluster->centroid); if(!defined $min_dist or $dist < $min_dist) { $c = $j; $min_dist = $dist; } } continue { $j++; } } $self->{clusters}[$c]->invalidate; $self->{cluster}[-1] = $c; my $is_still_moving = 1; while($is_still_moving) { # this loop will surely converge my @centroid = map $_->centroid, @{$self->{clusters}}; # assign all data to the new centroids $is_still_moving = 0; my $i = 0; for my $p (@{$self->{data}}) { my $c; { my $min_dist; for my $j (0 .. $#{$self->{clusters}}) { my $dist = _dist($p, $centroid[$j]); if(!defined $min_dist or $dist < $min_dist) { $c = $j; $min_dist = $dist; } } } if($c != $self->{cluster}[$i]) { $self->{clusters}[$self->{cluster}[$i]]->invalidate; $self->{clusters}[$c]->invalidate; $self->{cluster}[$i] = $c; $is_still_moving = 1; } } continue { $i++; } } } } } sub clusters { # Returns a list of all Cluster objects my $self = shift; return @{$self->{clusters}}; } sub _dist { # function return sqrt(sum map { my $d = $_[0][$_]-$_[1][$_]; $d*$d } 0 .. $#{$_[0]}); } package Data::Cluster::kMean::Cluster; use List::Util qw(sum); sub new { my $class = shift; my($parent, $index) = @_; bless { index => $index, data => $parent->{data}, cluster => $parent->{cluster}, centroid => undef}, $class; } sub points { # Returns a list of all points in cluster my $self = shift; my $index = $self->{index}; my @point = @{$self->{data}}[grep $self->{cluster}[$_] == $index, 0 .. $#{$self->{data}}]; return @point; } sub centroid { # Returns a point indicating the cluster's center of gravity my $self = shift; return $self->{centroid} ||= _centroid($self->points); } sub _centroid { # function return undef unless @_; my $dim = @{$_[0]}; return [ map { my $i = $_; sum(map $_->[$i], @_) / @_ } 0 .. $dim-1 ]; } sub invalidate { # Throw away cache my $self = shift; undef $self->{centroid}; } sub index { # integer, position in cluster array of parent my $self = shift; return $self->{index}; } 1;