#!/usr/bin/env perl
use strict;
use warnings;
use PDL;
use PDL::GSL::RNG;
use PDL::Stats::Basic;
use PDL::Ufunc;
####
# program to get an input data vector (1D)
# and calculate its statistics (mean, stdev)
# and confidence intervals using Bootstrap.
# For demonstration the input data is generated randomly
# with specified mean/stdev from Gaussian distribution.
# Alternatively one can populate the input as sees fit.
# Improving accuracy means increasing the re-sample size
# as well as the number of bootstrap iterations (R and M)
#
# Bootstrap is a non-parametric method to estimate sample
# statistics (unlike a t-test for example which ASSUMES that our
# input data comes from a Gaussian distribution) and it is roughly based
# on re-sampling our data many times (this is where the computer comes
# in and why this method is popular today) and each time
# calculating the statistics (mean, stdev, etc).
# Q: What is a confidence interval (CI)?
# Without getting into too much detail, a CI (as a range of sample values)
# of Y% is a property
# of our sample which says that if you draw random items from
# your sample, their value will be within the CI, Y% of the times.
# See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp
# more info: http://www.stat.wmich.edu/s160/book/node48.html
#
# author: bliako
# date: 03/05/2018
####
my $time_started = time;
# RNG seed or use time_started
my $seed = 1234; # $time_started;
# size of my input sample / data
my $N = 10000;
# number of bootstrap repeats (re-samplings)
my $M = 1000;
# number of re-sample size set as the same as input size
my $R = $N;
print "$0 : starting, N=$N, M=$M, R=$R\n";
# confidence intervals to calculate:
my $confidence_intervals_levels = [5/100, 95/100];
# ad-hoc mean and stdev of our input data
my $dist_mean = pdl(10.35);
# increasing stdev will expand CI
my $dist_stdev = pdl(1.231);
# produce the Random Number Generator to generate random data
# to populate our input sample (just for demo):
my $rng = PDL::GSL::RNG->new('taus')->set_seed($seed);
# produce a random sample from a Gaussian (Normal) distribution
# with specified stdev and mean:
my $X = zeroes($N);
$rng->ran_gaussian($dist_stdev, $X);
$X += $dist_mean;
my $input_data_statistics = [
PDL::Ufunc::avg($X),
PDL::Stats::Basic::stdv($X)
];
print "input data statistics:"
."\n\tmean: ".$input_data_statistics->[0]
."\n\tstdev: ".$input_data_statistics->[1]
."\n";
# Now we will re-sample our input $M times and each time
# we will record the statistic of interest,
# e.g. for demo this will be MEAN (=average)
# arrays to store the results of each iteration wrt mean and stdev:
my $means = zeroes($M);
my $stdevs = zeroes($M);
for(my $i=0;$i<$M;$i++){
# get a re-sample from original data with size R
# use our RNG to do the re-sampling:
my $asample = resample($X, $rng, $R);
$means->set($i, PDL::Ufunc::avg($asample));
$stdevs->set($i, PDL::Stats::Basic::stdv($asample));
}
my $estimated_statistics = [
PDL::Ufunc::avg($means),
PDL::Ufunc::avg($stdevs)
];
my $discrepancies = [
100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0],
100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1]
];
print "estimated statistics by bootstrap (after $M re-samplings):"
."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)"
."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)"
."\n";
# now sort the means vector and pick the elements at
# the confidence intervals specified, e.g. 5%
my $sorted_means = PDL::Ufunc::qsort($means);
my $confidence_intervals_values = [
$sorted_means->at(int($confidence_intervals_levels->[0]*$M)),
$sorted_means->at(int($confidence_intervals_levels->[1]*$M))
];
print "confidence intervals after bootstraping $M times with a sample size of $N each time:\n"
."\t".$confidence_intervals_levels->[0]." => ".$confidence_intervals_values->[0]."\n"
."\t".$confidence_intervals_levels->[1]." => ".$confidence_intervals_values->[1]."\n"
;
print "done in ".(time-$time_started)." seconds.\n";
# get a random sample with replacement from input vector
# 2nd param is a RNG object which will select N random
# elements (with replacement) from input vector
# 3rd param is OPTIONAL: the size of the sample,
# default is the size of the input vector
sub resample {
my $original_sample = $_[0];
my $arng = $_[1];
my $No = $_[2] || $original_sample->getdim(0);
my $indices = ($No * $rng->get_uniform($No))->floor();
my $newsample = $original_sample->slice($indices);
return $newsample;
}