Here is an attempt at implementing the Hampel identifier method mentioned in the article. Note, however, that reliably identifying outliers is problematic with so few data points.
#!/usr/bin/env perl
use strict;
use warnings;
use Statistics::Descriptive;
use List::Util qw/min max/;
# Outlier filter based on the Hampel identifier: a value is treated as an
# outlier when it lies more than $t median absolute deviations (MAD) away
# from the median of the data. The script prints the intermediate
# statistics, the surviving values, and the smallest surviving value.
my $stat = Statistics::Descriptive::Full->new();

# Sample data sets; exactly one should be active at a time.
#my @data = (4, 4, 3, 2); # "A" data
#my @data = (1, 5, 6); # "B" data
#my @data = (1, 80000, 2, 4, 1200); # "C" data
my @data = (0.1, 1500, 1700, 2100, 3200); # "D" data
print "Starting data: ", join(", ", @data), "\n\n";
$stat->add_data(@data);

# References
# http://exploringdatablog.blogspot.com/2013/02/finding-outliers-in-numerical-data.html
# https://en.wikipedia.org/wiki/Median_absolute_deviation
my $median = $stat->median();

# MAD = median of the absolute residuals around the median. The raw MAD is
# used as the scale estimate here; no consistency constant is applied.
my @abs_res = map { abs($median - $_) } @data;
my $abs_res_stat = Statistics::Descriptive::Full->new();
$abs_res_stat->add_data(@abs_res);
my $MAD = $abs_res_stat->median();

# Threshold multiplier: values further than $t * MAD from the median are
# rejected.
my $t = 3;
my $lower_limit = $median - $t * $MAD;
my $upper_limit = $median + $t * $MAD;

print " Median: $median\n";
print " MAD: $MAD\n";
print " t: $t\n\n";
print "Lower limit: $lower_limit\n";
print "Upper Limit: $upper_limit\n\n";

# Keep every value that is not strictly outside [lower_limit, upper_limit];
# values sitting exactly on a limit are retained.
my @filtered_data
    = grep { !( $_ < $lower_limit || $_ > $upper_limit ) } @data;

print "Filtered data: ", join(", ", @filtered_data), "\n\n";
print "Minimum value of filtered data is: ", min(@filtered_data), "\n";
exit;
|