#!/usr/bin/env perl use strict; use warnings; use PDL; use PDL::GSL::RNG; use PDL::Stats::Basic; use PDL::Ufunc; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my \$time_started = time; # RNG seed or use time_started my \$seed = 1234; # \$time_started; # size of my input sample / data my \$N = 10000; # number of bootstrap repeats (re-samplings) my \$M = 1000; # number of re-sample size set as the same as input size my \$R = \$N; print "\$0 : starting, N=\$N, M=\$M, R=\$R\n"; # confidence intervals to calculate: my \$confidence_intervals_levels = [5/100, 95/100]; # ad-hoc mean and stdev of our input data my \$dist_mean = pdl(10.35); # increasing stdev will expand CI my \$dist_stdev = pdl(1.231); # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my \$rng = PDL::GSL::RNG->new('taus')->set_seed(\$seed); # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean: my \$X = zeroes(\$N); \$rng->ran_gaussian(\$dist_stdev, \$X); \$X += \$dist_mean; my \$input_data_statistics = [ PDL::Ufunc::avg(\$X), PDL::Stats::Basic::stdv(\$X) ]; print "input data statistics:" ."\n\tmean: ".\$input_data_statistics->[0] ."\n\tstdev: ".\$input_data_statistics->[1] ."\n"; # Now we will re-sample our input \$M times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: my \$means = zeroes(\$M); my \$stdevs = zeroes(\$M); for(my \$i=0;\$i<\$M;\$i++){ # get a re-sample from original data with size R # use our RNG to do the re-sampling: my \$asample = resample(\$X, \$rng, \$R); \$means->set(\$i, PDL::Ufunc::avg(\$asample)); \$stdevs->set(\$i, PDL::Stats::Basic::stdv(\$asample)); } my \$estimated_statistics = [ PDL::Ufunc::avg(\$means), PDL::Ufunc::avg(\$stdevs) ]; my \$discrepancies = [ 100 * abs(\$estimated_statistics->[0] - \$input_data_statistics->[0])/\$input_data_statistics->[0], 100 * abs(\$estimated_statistics->[1] - \$input_data_statistics->[1])/\$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after \$M re-samplings):" ."\n\tmean: ".\$estimated_statistics->[0]." (discrepancy: ".\$discrepancies->[0]." %)" ."\n\tstdev: ".\$estimated_statistics->[1]." (discrepancy: ".\$discrepancies->[1]." %)" ."\n"; # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my \$sorted_means = PDL::Ufunc::qsort(\$means); my \$confidence_intervals_values = [ \$sorted_means->at(int(\$confidence_intervals_levels->[0]*\$M)), \$sorted_means->at(int(\$confidence_intervals_levels->[1]*\$M)) ]; print "confidence intervals after bootstraping \$M times with a sample size of \$N each time:\n" ."\t".\$confidence_intervals_levels->[0]." => ".\$confidence_intervals_values->[0]."\n" ."\t".\$confidence_intervals_levels->[1]." => ".\$confidence_intervals_values->[1]."\n" ; print "done in ".(time-\$time_started)." seconds.\n"; # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my \$original_sample = \$_[0]; my \$arng = \$_[1]; my \$No = \$_[2] || \$original_sample->getdim(0); my \$indices = (\$No * \$rng->get_uniform(\$No))->floor(); my \$newsample = \$original_sample->slice(\$indices); return \$newsample; } ##```## # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my \$original_sample = \$_[0]; my \$arng = \$_[1]; my \$No = \$_[2] || \$original_sample->getdim(0); my \$indices = (\$No * \$rng->get_uniform(\$No))->floor(); my \$newsample = \$original_sample->slice(\$indices); return \$newsample; } my \$M = 1000; # num bootstrap resamplings my \$X = ... ; # input data piddle (1D) my \$R = \$X->getdim(0); # size of bootstrap resamples my \$rng = PDL::GSL::RNG->new('taus')->set_seed(\$seed); my \$means = zeroes(\$M); for(my \$i=0;\$i<\$M;\$i++){ # get a re-sample from original data with size R # use our RNG to do the re-sampling: my \$asample = resample(\$X, \$rng, \$R); \$means->set(\$i, PDL::Ufunc::avg(\$asample)); } # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my \$sorted_means = PDL::Ufunc::qsort(\$means); my \$confidence_intervals_values = [ \$sorted_means->at(int(0.05*\$M)), \$sorted_means->at(int(0.95*\$M)) ]; ##``````## #!/usr/bin/env perl use strict; use warnings; use threads; use threads::shared; use PDL; use PDL::Stats::Basic; use PDL::Ufunc; use PDL::Parallel::threads; use Thread::Queue; # MT random number generator for thread-safe operation # not just Math::Random::MT (which gave me problems) # but Math::Random::MT::Auto (btw neither Math::Random::MTwist/OO works - maybe my fault?) use Math::Random::MT::Auto; use Getopt::Long; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my \$input_filename = undef; # default confidence intervals to calculate: my @confidence_intervals_levels = (); my \$num_bootstrap_repeats = 1000; my \$bootstrap_sample_size = undef; my \$max_num_threads = 3; my @demo_params; if( ! Getopt::Long::GetOptions( 'input|i=s' => \\$input_filename, 'confidence-intervals|c=f{2}' => \@confidence_intervals_levels, 'num-bootstrap-repeats|R=i' => \\$num_bootstrap_repeats, 'bootstrap-sample-size|S=i' => \\$bootstrap_sample_size, 'max-num-threads=i' => \\$max_num_threads, 'demo=f{3}' => \@demo_params, 'help' => sub { print usage(\$0); exit(0) } ) ){ print STDERR usage(\$0) . "\n\n\$0 : something wrong with command line parameters.\n"; exit(1) } my \$time_started = time; if( scalar(@confidence_intervals_levels) == 0 ){ @confidence_intervals_levels = (5/100, 95/100) } my (\$i, \$N, \$line); # my input data piddle my \$X; # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my \$rngMT = Math::Random::MT::Auto->new(); if( ! defined(\$rngMT) ){ print STDERR "\$0 : call to ".'Math::Random::MT::Auto->new()'." has failed.\n"; exit(1) } if( defined(\$input_filename) ){ # read data from file: if( ! open(INP, '<', \$input_filename) ){ print STDERR "\$0 : could not open input file '\$input_filename', \$!\n"; exit(1) } \$i = 0; while( \$line = ){ chomp(\$line); if( ! (\$line =~ /^\s*\$/) ){ \$i++; } } seek(INP, 0, 0); # rewind filehandle \$i = 0; while( \$line = ){ chomp(\$line); if( ! (\$line =~ /^\s*\$/) ){ \$X->set(\$i++, \$line) } } close(INP); } elsif( scalar(@demo_params) == 3 ){ # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean for demo \$N = \$demo_params[2]; print "\$0 : creating random data of \$N items with mean = ".\$demo_params[0]." and stdev = ".\$demo_params[1]." ...\n"; \$X = zeroes(\$N); # populates above piddle with rands from gaussian dist with specified mean/stdev # using the RNG obj provided gaussian_rand({ 'mean' => \$demo_params[0], 'stdev' => \$demo_params[1], 'piddle' => \$X, 'rng' => \$rngMT }); } else { print STDERR usage(\$0) . "\n\$0 : either an input file must be specified (--input) or use the demo switch as : --demo MEAN STDEV SAMPLE_SIZE\n"; exit(1) } # size of input data \$N = \$X->getdim(0); \$bootstrap_sample_size = \$N unless defined(\$bootstrap_sample_size); print "\$0 : starting with\n\tinput sample size: \$N\n\tbootstrap repeats: \$num_bootstrap_repeats\n\tbootstrap sample size: \$bootstrap_sample_size\n\tparallelising over \$max_num_threads threads.\n"; my \$input_data_statistics = [ PDL::Ufunc::avg(\$X), PDL::Stats::Basic::stdv(\$X) ]; print "\$0 : input data statistics:" ."\n\tmean: ".\$input_data_statistics->[0] ."\n\tstdev: ".\$input_data_statistics->[1] ."\n"; # Now we will re-sample our input \$bootstrap_sample_size times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: # size will be equal to number of repeats my \$means = zeroes(\$num_bootstrap_repeats); my \$stdevs = zeroes(\$num_bootstrap_repeats); # we need to share these \$X->share_as('X'); \$means->share_as('means'); \$stdevs->share_as('stdevs'); my \$Q = Thread::Queue->new(); sub worker { my \$tid = threads->tid; my \$worker_rngMT = Math::Random::MT::Auto->new(); if( ! defined(\$worker_rngMT) ){ print STDERR "\$0 : call to ".'Math::Random::MT::Auto->new()'." has failed for worker \$tid.\n"; exit(1) } while( defined(my \$item = \$Q->dequeue())){ my \$shallow_copy_of_X = PDL::Parallel::threads::retrieve_pdls('X'); my \$shallow_copy_of_means = PDL::Parallel::threads::retrieve_pdls('means'); my \$shallow_copy_of_stdevs = PDL::Parallel::threads::retrieve_pdls('stdevs'); my \$asample = resample( \$shallow_copy_of_X, \$worker_rngMT, \$bootstrap_sample_size ); my \$amean = PDL::Ufunc::avg(\$asample); my \$astdev = PDL::Stats::Basic::stdv(\$asample); \$shallow_copy_of_means->set(\$item, \$amean); \$shallow_copy_of_stdevs->set(\$item, \$astdev); #print "I am worker \$tid and getting index=\$item : \$amean, \$astdev\n"; } print "worker \$tid is ending\n"; } # worker sub my @tpool = map{ threads->create(\&worker, \$Q) } 1 .. \$max_num_threads; for(my \$i=0;\$i<\$num_bootstrap_repeats;\$i++){ \$Q->enqueue(\$i); } \$Q->end(); \$_->join for @tpool; #exit(0); my \$estimated_statistics = [ PDL::Ufunc::avg(\$means), PDL::Ufunc::avg(\$stdevs) ]; =pod my \$discrepancies = [ 100 * abs(\$estimated_statistics->[0] - \$input_data_statistics->[0])/\$input_data_statistics->[0], 100 * abs(\$estimated_statistics->[1] - \$input_data_statistics->[1])/\$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after \$M re-samplings):" ."\n\tmean: ".\$estimated_statistics->[0]." (discrepancy: ".\$discrepancies->[0]." %)" ."\n\tstdev: ".\$estimated_statistics->[1]." (discrepancy: ".\$discrepancies->[1]." %)" ."\n"; =cut # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my \$sorted_means = PDL::Ufunc::qsort(\$means); my @confidence_intervals_values = ( \$sorted_means->at(int(\$confidence_intervals_levels[0]*\$num_bootstrap_repeats)), \$sorted_means->at(int(\$confidence_intervals_levels[1]*\$num_bootstrap_repeats)) ); print "confidence intervals after bootstraping \$num_bootstrap_repeats times with a sample size of \$N each time:\n" ."\t".\$confidence_intervals_levels[0]." => ".\$confidence_intervals_values[0]."\n" ."\t".\$confidence_intervals_levels[1]." => ".\$confidence_intervals_values[1]."\n" ; print "done in ".(time-\$time_started)." seconds.\n"; exit(0); sub usage { return "Usage : ".\$_[0]." --input file.txt | --demo MEAN STDEV SAMPLE_SIZE [--confidence-intervals LEFT RIGHT (e.g. 0.05 0.95 for 5% and 95%)] [--num-bootstrap-repeats N] [--bootstrap-sample-size S (if not specified it will be the same as input sample size)] [--max-num-threads P]\nExample:\n".\$_[0]." --demo 10.32 1.23 1000 --max-num-threads 4 --confidence-intervals 0.075 0.925\n" } # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my \$original_sample = \$_[0]; my \$arng = \$_[1]; # RNG to give us random indices to sample my \$No = \$_[2] || \$original_sample->getdim(0); # size of sample my \$newsample = zeroes(\$No); for(my \$i=\$No;\$i-->0;){ \$newsample->set(\$i, \$original_sample->at(floor(\$No * \$arng->rand()))) } return \$newsample; } # returns n random numbers from the gaussian (normal) distribution # with mean and stdev optionally specified. Default values are mean=0, stdev=1 # An rng object which exports rand([\$n]) is required sub gaussian_rand { my \$params = \$_[0]; my \$rngobj; if( ! defined(\$rngobj=\$params->{'rng'}) ){ # now this is weird: # my \$rngobj = \$params->{'rng'} or die ... fails once in a while! die "rng is required, params were: ".join(",",keys %\$params)." and rng was ".\$params->{'rng'}."\n"; } my \$piddle_to_return_results = \$params->{'piddle'}; my \$array_to_return_results = \$params->{'array'}; my \$n = undef; if( defined(\$piddle_to_return_results) ){ \$n = \$piddle_to_return_results->getdim(0); } else { if( defined(\$array_to_return_results) ){ \$n = scalar(@\$array_to_return_results); } else { \$n = \$params->{'n'} or die "n is required if no array or piddle references are specified."; \$array_to_return_results = [ (0)x\$n ]; } } my (\$mean, \$stdev); if( ! defined(\$mean=\$params->{'mean'}) ){ \$mean = 0.0 } if( ! defined(\$stdev=\$params->{'stdev'}) ){ \$stdev = 1.0 } # following code is from "Perl Cookbook" (http://www.oreilly.com/catalog/perlckbk2/) my (\$u1, \$u2); # uniformly distributed random numbers my \$w; # variance, then a weight my (\$g1, \$g2); # gaussian-distributed numbers my \$i; for(\$i=0;\$i<(\$n-1);\$i+=2){ do { \$u1 = 2 * \$rngobj->rand() - 1; \$u2 = 2 * \$rngobj->rand() - 1; \$w = \$u1*\$u1 + \$u2*\$u2; } while ( \$w >= 1 ); \$w = sqrt( (-2 * log(\$w)) / \$w ); \$g2 = (\$u1 * \$w) * \$stdev + \$mean; \$g1 = (\$u2 * \$w) * \$stdev + \$mean; if( defined(\$array_to_return_results) ){ \$array_to_return_results->[\$i] = \$g1; \$array_to_return_results->[\$i+1] = \$g2; } else { \$piddle_to_return_results->set(\$i, \$g1); \$piddle_to_return_results->set(\$i+1, \$g2); } } if( \$n % 2 == 1 ){ \$i = \$n-1; do { \$u1 = 2 * \$rngobj->rand() - 1; \$u2 = 2 * \$rngobj->rand() - 1; \$w = \$u1*\$u1 + \$u2*\$u2; } while ( \$w >= 1 ); \$w = sqrt( (-2 * log(\$w)) / \$w ); \$g2 = (\$u1 * \$w) * \$stdev + \$mean; \$g1 = (\$u2 * \$w) * \$stdev + \$mean; if( defined(\$array_to_return_results) ){ \$array_to_return_results->[\$i] = \$g1; } else { \$piddle_to_return_results->set(\$i, \$g1); } } return \$array_to_return_results; } ```