#!/usr/bin/perl
use warnings;
use strict;
use PDL;

# fcm: fuzzy c-means implementation in Perl
# usage: $fcm [number_of_clusters] [fuzzification_factor]
#             [max_iter] [tolerance]
# returns: prototypes, partition_matrix (both printed to STDOUT,
#          together with the performance index)
#
# Defaults when an argument is omitted: 2 clusters, fuzzification
# factor 2.0, 40 iterations, tolerance 0.00001.  The absolute value of
# every argument is used.  Patterns are read from the __DATA__ section,
# one whitespace-separated pattern per line.

#
# reading data
#
my @data;
while (defined(my $line = <DATA>)) {
    chomp $line;

    # split ' ' ignores leading whitespace, so an indented line does not
    # produce an empty first field.
    my @fields = split ' ', $line;
    next unless @fields;    # skip blank lines
    push @data, [@fields];
}

my $number_of_patterns = @data;
die "fcm: no input patterns found in the DATA section\n"
    unless $number_of_patterns;

# Passing a reference keeps the result two-dimensional even when there
# is only one pattern; each input line becomes one row.
my $patterns = pdl(\@data);

#
# assigning other variables
#
my $number_of_clusters   = shift @ARGV;
my $fuzzification_factor = shift @ARGV;
my $max_iter             = shift @ARGV;
my $tolerance            = shift @ARGV;

$number_of_clusters   = 2       unless defined $number_of_clusters;
$fuzzification_factor = 2.0     unless defined $fuzzification_factor;
$max_iter             = 40      unless defined $max_iter;
$tolerance            = 0.00001 unless defined $tolerance;

$number_of_clusters   = abs($number_of_clusters);
$fuzzification_factor = abs($fuzzification_factor);
$max_iter             = abs($max_iter);
$tolerance            = abs($tolerance);

die "fcm: number_of_clusters must be at least 1\n"
    if $number_of_clusters < 1;

# The membership update raises distances to -2 / (m - 1); m == 1 would
# divide by zero and m < 1 would flip the sign of the exponent.
die "fcm: fuzzification_factor must be greater than 1\n"
    if $fuzzification_factor <= 1;

#
# initializing partition matrices
#
my $previous_partition_matrix;
my $current_partition_matrix =
    initialize_partition_matrix($number_of_clusters, $number_of_patterns);

#
# output variables
#
my $prototypes;
my $performance_index;

#
# fuzzy c means implementation
#
my $max_row_number      = $number_of_patterns - 1;
my $max_column_number   = $number_of_clusters - 1;
my $membership_exponent = -2 / ($fuzzification_factor - 1);

# Lower bound applied to every distance so that a pattern lying exactly
# on a prototype does not cause a division by zero in the update below.
my $minimum_distance = 1e-12;

my $iter = 0;
while (1) {

    # computing each prototype: membership-weighted mean of the patterns
    my $temporal_partition_matrix =
        $current_partition_matrix ** $fuzzification_factor;
    my $temp_prototypes =
        mv($temporal_partition_matrix x $patterns, 1, 0)
        / sumover($temporal_partition_matrix);
    $prototypes = mv($temp_prototypes, 1, 0);

    # copying partition matrix
    $previous_partition_matrix = $current_partition_matrix->copy;

    # updating the partition matrix
    # $dist holds the distance from pattern $i to prototype $j at ($i, $j)
    my $dist = zeroes $number_of_patterns, $number_of_clusters;
    for my $i (0 .. $max_row_number) {
        for my $j (0 .. $max_column_number) {
            my $temp_distance = distance(
                $patterns->slice(":,$i"),
                $prototypes->slice(":,$j"),
                \&euclidean
            );
            $temp_distance = $minimum_distance
                if $temp_distance < $minimum_distance;
            $dist->set($i, $j, $temp_distance);
        }
    }

    # Normalise so that the memberships of each pattern sum to 1.
    my $temp_variable = $dist ** $membership_exponent;
    $current_partition_matrix =
        $temp_variable / sumover(mv($temp_variable, 1, 0));

    #
    # Performance Index calculation
    #
    $temporal_partition_matrix =
        $current_partition_matrix ** $fuzzification_factor;
    $performance_index = sum($temporal_partition_matrix * ($dist ** 2));

    # checking stop conditions
    # The largest *absolute* membership change is compared with the
    # tolerance; a signed maximum would ignore memberships that decreased.
    my $diff_partition_matrix =
        $current_partition_matrix - $previous_partition_matrix;
    $iter++;
    if (   (abs($diff_partition_matrix)->max < $tolerance)
        || ($iter >= $max_iter) )
    {
        last;
    }
    print "iter = $iter\n";
}

print "=======================================\n";
print "clustering completed\n";
print "performance index = $performance_index\n";
print "prototypes = \n";
print $prototypes;
print "current partition matrix = \n";
print $current_partition_matrix;

# ================================
# initialize_partition_matrix
# partition_matrix =
#   initialize_partition_matrix(
#               num_clusters, num_patterns)
#
# Returns a random ndarray with PDL dims (num_patterns, num_clusters),
# scaled so that the memberships of every pattern sum to 1.
# ================================
sub initialize_partition_matrix {
    my ($num_clusters, $num_patterns) = @_;

    my $partition_matrix = random($num_patterns, $num_clusters);

    # sum over columns: one total per pattern
    my $column_sum = sumover(mv($partition_matrix, 1, 0));
    $partition_matrix /= $column_sum;

    return $partition_matrix;
}

# ====================================
# compute distance between two vectors
# dist = distance( vector1, vector2, \&type_of_distance )
#
# type_of_distance is a code reference that receives the difference
# vector1 - vector2 and returns its length; see manhattan, euclidean
# and tschebyschev below.
# ====================================
sub distance {
    my ($vector1, $vector2, $type_of_distance) = @_;
    my $difference = $vector1 - $vector2;
    return $type_of_distance->($difference);
}

# Sum of the absolute components of the difference vector.
sub manhattan {
    my ($difference) = @_;
    return sum(abs($difference));
}

# Square root of the sum of the squared components.
sub euclidean {
    my ($difference) = @_;
    return sqrt(sum($difference ** 2));
}

# Largest absolute component of the difference vector.
sub tschebyschev {
    my ($difference) = @_;
    return max(abs($difference));
}

# NOTE(review): the sample data below is laid out as one two-value
# pattern per line (30 patterns in total) -- confirm against the
# original data file, whose line breaks were not available.
__DATA__
4.0 4.0
4.0 5.0
5.0 4.0
5.5 6.0
5.0 5.0
4.5 4.5
5.0 5.5
5.5 5.0
5.0 4.5
4.5 5.0
9.5 9.0
9.0 9.5
8.0 8.0
7.0 8.0
8.0 7.0
8.5 7.0
7.0 8.5
7.0 7.0
7.5 7.0
6.5 8.0
8.0 6.5
6.5 7.0
10.0 10.0
10.0 9.0
10.0 9.0
9.5 10.0
8.0 10.0
9.5 9.5
9.0 9.0
9.0 10.0