It is possible to do cluster analysis in Perl. You could use the functions in the Algorithm::Cluster module that was mentioned before or you could write your own clustering functions using the PDL module that was also mentioned before. However, before getting deep into those modules I recommend you to have a look at the Wikipedia entry on Data Clustering.
For us to help you better, it would be good to have a small sample of your data or at least a better description of their structure.
#!/usr/bin/perl
use warnings;
use strict;
use PDL;
# fcm: fuzzy c-means implementation in Perl
# usage: $fcm [number_of_clusters] [fuzzification_factor]
# [max_iter] [tolerace]
# returns: prototypes, partition_matrix
#
#
# reading data
#
my ( @data, @tmp, $number_of_patterns, $max_row_number, $max_column_nu
+mber );
while (defined(my $line = <DATA>)) {
chomp ($line);
@tmp = split /\s+/, $line;
push @data, [ @tmp ];
}
$number_of_patterns = @data;
my $patterns = pdl(@data);
#
# assigning other variables
#
my $number_of_clusters = shift @ARGV;
my $fuzzification_factor = shift @ARGV;
my $max_iter = shift @ARGV;
my $tolerance = shift @ARGV;
unless (defined($number_of_clusters)) {
$number_of_clusters ||= 2;
}
unless (defined($fuzzification_factor)) {
$fuzzification_factor ||= 2.0;
}
unless (defined($max_iter)) { $max_iter ||= 40; }
unless (defined($tolerance)) { $tolerance ||= 0.00001; }
$number_of_clusters = abs($number_of_clusters);
$fuzzification_factor = abs($fuzzification_factor);
$max_iter = abs($max_iter);
$tolerance = abs($tolerance);
#
# initializing partition matrices
#
my $previous_partition_matrix;
my $current_partition_matrix =
initialize_partition_matrix($number_of_clusters, $number_of_patte
+rns);
#
# output variables
#
my $prototypes;
my $performance_index;
#
# fuzzy c means implementation
#
$max_row_number = $number_of_patterns - 1;
$max_column_number = $number_of_clusters - 1;
my $iter = 0;
while (1) {
# computing each prototype
my $temporal_partition_matrix = $current_partition_matrix ** $fuzz
+ification_factor;
my $temp_prototypes = mv( $temporal_partition_matrix x $patterns,
+1,0) / sumover($temporal_partition_matrix);
$prototypes = mv($temp_prototypes,1,0);
# copying partition matrix
$previous_partition_matrix = $current_partition_matrix->copy;
# updating the partition matrix
my $dist = zeroes $number_of_patterns, $number_of_clusters;
for my $i (0..$max_row_number){
for my $j (0..$max_column_number){
my $temp_distance = distance($patterns->slice(":,$i"), $pr
+ototypes->slice(":,$j"), \&euclidean );
$dist->set($i, $j, $temp_distance);
}
}
my $temp_variable = $dist ** (-2/($fuzzification_factor - 1));
$current_partition_matrix = $temp_variable / sumover(mv($temp_vari
+able,1,0));
#
# Performance Index calculation
#
$temporal_partition_matrix = $current_partition_matrix ** $fuzzifi
+cation_factor;
$performance_index = sum($temporal_partition_matrix * ( $dist ** 2
+ ));
# checking stop conditions
my $diff_partition_matrix = $current_partition_matrix - $previous_
+partition_matrix;
$iter++;
if ( ($diff_partition_matrix->max < $tolerance) || ($iter > $max_i
+ter) ) {
last;
}
print "iter = $iter\n";
}
print "=======================================\n";
print "clustering completed\n";
print "performance index = $performance_index\n";
print "prototypes = \n";
print $prototypes;
print "current partition matrix = \n";
print $current_partition_matrix;
# ================================
# initialize_partition_matrix
# partition_matrix =
# initialize_partition_matrix(
# num_clusters, num_patterns)
# ================================
sub initialize_partition_matrix {
my ($partition_matrix, $column_sum);
$partition_matrix = random($_[1],$_[0]);
$column_sum = sumover (mv($partition_matrix, 1, 0));#sum over column
+s
$partition_matrix /= $column_sum;
return $partition_matrix;
}
# ====================================
# compute distance between two vectors
# dist = distance( vector1, vector2, /&type_of_distance )
# ====================================
sub distance{
my ($vector1, $vector2, $type_of_distance) = @_;
my ($r) = $vector1 - $vector2;
$type_of_distance->($r);
}
sub manhattan{ sum(abs($_[0]));}
sub euclidean{ sqrt(sum($_[0] ** 2) );}
sub tschebyschev{ max(abs($_[0])); }
__DATA__
4.0 4.0
4.0 5.0
5.0 4.0
5.5 6.0
5.0 5.0
4.5 4.5
5.0 5.5
5.5 5.0
5.0 4.5
4.5 5.0
9.5 9.0
9.0 9.5
8.0 8.0
7.0 8.0
8.0 7.0
8.5 7.0
7.0 8.5
7.0 7.0
7.5 7.0
6.5 8.0
8.0 6.5
6.5 7.0
10.0 10.0
10.0 9.0
10.0 9.0
9.5 10.0
8.0 10.0
9.5 9.5
9.0 9.0
9.0 10.0