#!/usr/bin/env perl use strict; use warnings; use PDL; use PDL::GSL::RNG; use PDL::Stats::Basic; use PDL::Ufunc; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my $time_started = time; # RNG seed or use time_started my $seed = 1234; # $time_started; # size of my input sample / data my $N = 10000; # number of bootstrap repeats (re-samplings) my $M = 1000; # number of re-sample size set as the same as input size my $R = $N; print "$0 : starting, N=$N, M=$M, R=$R\n"; # confidence intervals to calculate: my $confidence_intervals_levels = [5/100, 95/100]; # ad-hoc mean and stdev of our input data my $dist_mean = pdl(10.35); # increasing stdev will expand CI my $dist_stdev = pdl(1.231); # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my $rng = PDL::GSL::RNG->new('taus')->set_seed($seed); # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean: my $X = zeroes($N); $rng->ran_gaussian($dist_stdev, $X); $X += $dist_mean; my $input_data_statistics = [ PDL::Ufunc::avg($X), PDL::Stats::Basic::stdv($X) ]; print "input data statistics:" ."\n\tmean: ".$input_data_statistics->[0] ."\n\tstdev: ".$input_data_statistics->[1] ."\n"; # Now we will re-sample our input $M times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: my $means = zeroes($M); my $stdevs = zeroes($M); for(my $i=0;$i<$M;$i++){ # get a re-sample from original data with size R # use our RNG to do the re-sampling: my $asample = resample($X, $rng, $R); $means->set($i, PDL::Ufunc::avg($asample)); $stdevs->set($i, PDL::Stats::Basic::stdv($asample)); } my $estimated_statistics = [ PDL::Ufunc::avg($means), PDL::Ufunc::avg($stdevs) ]; my $discrepancies = [ 100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0], 100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after $M re-samplings):" ."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)" ."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)" ."\n"; # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my $sorted_means = PDL::Ufunc::qsort($means); my $confidence_intervals_values = [ $sorted_means->at(int($confidence_intervals_levels->[0]*$M)), $sorted_means->at(int($confidence_intervals_levels->[1]*$M)) ]; print "confidence intervals after bootstraping $M times with a sample size of $N each time:\n" ."\t".$confidence_intervals_levels->[0]." => ".$confidence_intervals_values->[0]."\n" ."\t".$confidence_intervals_levels->[1]." => ".$confidence_intervals_values->[1]."\n" ; print "done in ".(time-$time_started)." seconds.\n"; # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my $original_sample = $_[0]; my $arng = $_[1]; my $No = $_[2] || $original_sample->getdim(0); my $indices = ($No * $rng->get_uniform($No))->floor(); my $newsample = $original_sample->slice($indices); return $newsample; }