#!/usr/bin/env perl use strict; use warnings; use PDL; use PDL::GSL::RNG; use PDL::Stats::Basic; use PDL::Ufunc; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my \$time_started = time; # RNG seed or use time_started my \$seed = 1234; # \$time_started; # size of my input sample / data my \$N = 10000; # number of bootstrap repeats (re-samplings) my \$M = 1000; # number of re-sample size set as the same as input size my \$R = \$N; print "\$0 : starting, N=\$N, M=\$M, R=\$R\n"; # confidence intervals to calculate: my \$confidence_intervals_levels = [5/100, 95/100]; # ad-hoc mean and stdev of our input data my \$dist_mean = pdl(10.35); # increasing stdev will expand CI my \$dist_stdev = pdl(1.231); # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my \$rng = PDL::GSL::RNG->new('taus')->set_seed(\$seed); # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean: my \$X = zeroes(\$N); \$rng->ran_gaussian(\$dist_stdev, \$X); \$X += \$dist_mean; my \$input_data_statistics = [ PDL::Ufunc::avg(\$X), PDL::Stats::Basic::stdv(\$X) ]; print "input data statistics:" ."\n\tmean: ".\$input_data_statistics->[0] ."\n\tstdev: ".\$input_data_statistics->[1] ."\n"; # Now we will re-sample our input \$M times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: my \$means = zeroes(\$M); my \$stdevs = zeroes(\$M); for(my \$i=0;\$i<\$M;\$i++){ # get a re-sample from original data with size R # use our RNG to do the re-sampling: my \$asample = resample(\$X, \$rng, \$R); \$means->set(\$i, PDL::Ufunc::avg(\$asample)); \$stdevs->set(\$i, PDL::Stats::Basic::stdv(\$asample)); } my \$estimated_statistics = [ PDL::Ufunc::avg(\$means), PDL::Ufunc::avg(\$stdevs) ]; my \$discrepancies = [ 100 * abs(\$estimated_statistics->[0] - \$input_data_statistics->[0])/\$input_data_statistics->[0], 100 * abs(\$estimated_statistics->[1] - \$input_data_statistics->[1])/\$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after \$M re-samplings):" ."\n\tmean: ".\$estimated_statistics->[0]." (discrepancy: ".\$discrepancies->[0]." %)" ."\n\tstdev: ".\$estimated_statistics->[1]." (discrepancy: ".\$discrepancies->[1]." %)" ."\n"; # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my \$sorted_means = PDL::Ufunc::qsort(\$means); my \$confidence_intervals_values = [ \$sorted_means->at(int(\$confidence_intervals_levels->[0]*\$M)), \$sorted_means->at(int(\$confidence_intervals_levels->[1]*\$M)) ]; print "confidence intervals after bootstraping \$M times with a sample size of \$N each time:\n" ."\t".\$confidence_intervals_levels->[0]." => ".\$confidence_intervals_values->[0]."\n" ."\t".\$confidence_intervals_levels->[1]." => ".\$confidence_intervals_values->[1]."\n" ; print "done in ".(time-\$time_started)." seconds.\n"; # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my \$original_sample = \$_[0]; my \$arng = \$_[1]; my \$No = \$_[2] || \$original_sample->getdim(0); my \$indices = (\$No * \$rng->get_uniform(\$No))->floor(); my \$newsample = \$original_sample->slice(\$indices); return \$newsample; }