#!/usr/bin/env perl use strict; use warnings; use threads; use threads::shared; use PDL; use PDL::Stats::Basic; use PDL::Ufunc; use PDL::Parallel::threads; use Thread::Queue; # MT random number generator for thread-safe operation # not just Math::Random::MT (which gave me problems) # but Math::Random::MT::Auto (btw neither Math::Random::MTwist/OO works - maybe my fault?) use Math::Random::MT::Auto; use Getopt::Long; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my $input_filename = undef; # default confidence intervals to calculate: my @confidence_intervals_levels = (); my $num_bootstrap_repeats = 1000; my $bootstrap_sample_size = undef; my $max_num_threads = 3; my @demo_params; if( ! Getopt::Long::GetOptions( 'input|i=s' => \$input_filename, 'confidence-intervals|c=f{2}' => \@confidence_intervals_levels, 'num-bootstrap-repeats|R=i' => \$num_bootstrap_repeats, 'bootstrap-sample-size|S=i' => \$bootstrap_sample_size, 'max-num-threads=i' => \$max_num_threads, 'demo=f{3}' => \@demo_params, 'help' => sub { print usage($0); exit(0) } ) ){ print STDERR usage($0) . "\n\n$0 : something wrong with command line parameters.\n"; exit(1) } my $time_started = time; if( scalar(@confidence_intervals_levels) == 0 ){ @confidence_intervals_levels = (5/100, 95/100) } my ($i, $N, $line); # my input data piddle my $X; # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my $rngMT = Math::Random::MT::Auto->new(); if( ! defined($rngMT) ){ print STDERR "$0 : call to ".'Math::Random::MT::Auto->new()'." has failed.\n"; exit(1) } if( defined($input_filename) ){ # read data from file: if( ! open(INP, '<', $input_filename) ){ print STDERR "$0 : could not open input file '$input_filename', $!\n"; exit(1) } $i = 0; while( $line = ){ chomp($line); if( ! ($line =~ /^\s*$/) ){ $i++; } } seek(INP, 0, 0); # rewind filehandle $i = 0; while( $line = ){ chomp($line); if( ! ($line =~ /^\s*$/) ){ $X->set($i++, $line) } } close(INP); } elsif( scalar(@demo_params) == 3 ){ # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean for demo $N = $demo_params[2]; print "$0 : creating random data of $N items with mean = ".$demo_params[0]." and stdev = ".$demo_params[1]." ...\n"; $X = zeroes($N); # populates above piddle with rands from gaussian dist with specified mean/stdev # using the RNG obj provided gaussian_rand({ 'mean' => $demo_params[0], 'stdev' => $demo_params[1], 'piddle' => $X, 'rng' => $rngMT }); } else { print STDERR usage($0) . "\n$0 : either an input file must be specified (--input) or use the demo switch as : --demo MEAN STDEV SAMPLE_SIZE\n"; exit(1) } # size of input data $N = $X->getdim(0); $bootstrap_sample_size = $N unless defined($bootstrap_sample_size); print "$0 : starting with\n\tinput sample size: $N\n\tbootstrap repeats: $num_bootstrap_repeats\n\tbootstrap sample size: $bootstrap_sample_size\n\tparallelising over $max_num_threads threads.\n"; my $input_data_statistics = [ PDL::Ufunc::avg($X), PDL::Stats::Basic::stdv($X) ]; print "$0 : input data statistics:" ."\n\tmean: ".$input_data_statistics->[0] ."\n\tstdev: ".$input_data_statistics->[1] ."\n"; # Now we will re-sample our input $bootstrap_sample_size times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: # size will be equal to number of repeats my $means = zeroes($num_bootstrap_repeats); my $stdevs = zeroes($num_bootstrap_repeats); # we need to share these $X->share_as('X'); $means->share_as('means'); $stdevs->share_as('stdevs'); my $Q = Thread::Queue->new(); sub worker { my $tid = threads->tid; my $worker_rngMT = Math::Random::MT::Auto->new(); if( ! defined($worker_rngMT) ){ print STDERR "$0 : call to ".'Math::Random::MT::Auto->new()'." has failed for worker $tid.\n"; exit(1) } while( defined(my $item = $Q->dequeue())){ my $shallow_copy_of_X = PDL::Parallel::threads::retrieve_pdls('X'); my $shallow_copy_of_means = PDL::Parallel::threads::retrieve_pdls('means'); my $shallow_copy_of_stdevs = PDL::Parallel::threads::retrieve_pdls('stdevs'); my $asample = resample( $shallow_copy_of_X, $worker_rngMT, $bootstrap_sample_size ); my $amean = PDL::Ufunc::avg($asample); my $astdev = PDL::Stats::Basic::stdv($asample); $shallow_copy_of_means->set($item, $amean); $shallow_copy_of_stdevs->set($item, $astdev); #print "I am worker $tid and getting index=$item : $amean, $astdev\n"; } print "worker $tid is ending\n"; } # worker sub my @tpool = map{ threads->create(\&worker, $Q) } 1 .. $max_num_threads; for(my $i=0;$i<$num_bootstrap_repeats;$i++){ $Q->enqueue($i); } $Q->end(); $_->join for @tpool; #exit(0); my $estimated_statistics = [ PDL::Ufunc::avg($means), PDL::Ufunc::avg($stdevs) ]; =pod my $discrepancies = [ 100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0], 100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after $M re-samplings):" ."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)" ."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)" ."\n"; =cut # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my $sorted_means = PDL::Ufunc::qsort($means); my @confidence_intervals_values = ( $sorted_means->at(int($confidence_intervals_levels[0]*$num_bootstrap_repeats)), $sorted_means->at(int($confidence_intervals_levels[1]*$num_bootstrap_repeats)) ); print "confidence intervals after bootstraping $num_bootstrap_repeats times with a sample size of $N each time:\n" ."\t".$confidence_intervals_levels[0]." => ".$confidence_intervals_values[0]."\n" ."\t".$confidence_intervals_levels[1]." => ".$confidence_intervals_values[1]."\n" ; print "done in ".(time-$time_started)." seconds.\n"; exit(0); sub usage { return "Usage : ".$_[0]." --input file.txt | --demo MEAN STDEV SAMPLE_SIZE [--confidence-intervals LEFT RIGHT (e.g. 0.05 0.95 for 5% and 95%)] [--num-bootstrap-repeats N] [--bootstrap-sample-size S (if not specified it will be the same as input sample size)] [--max-num-threads P]\nExample:\n".$_[0]." --demo 10.32 1.23 1000 --max-num-threads 4 --confidence-intervals 0.075 0.925\n" } # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my $original_sample = $_[0]; my $arng = $_[1]; # RNG to give us random indices to sample my $No = $_[2] || $original_sample->getdim(0); # size of sample my $newsample = zeroes($No); for(my $i=$No;$i-->0;){ $newsample->set($i, $original_sample->at(floor($No * $arng->rand()))) } return $newsample; } # returns n random numbers from the gaussian (normal) distribution # with mean and stdev optionally specified. Default values are mean=0, stdev=1 # An rng object which exports rand([$n]) is required sub gaussian_rand { my $params = $_[0]; my $rngobj; if( ! defined($rngobj=$params->{'rng'}) ){ # now this is weird: # my $rngobj = $params->{'rng'} or die ... fails once in a while! die "rng is required, params were: ".join(",",keys %$params)." and rng was ".$params->{'rng'}."\n"; } my $piddle_to_return_results = $params->{'piddle'}; my $array_to_return_results = $params->{'array'}; my $n = undef; if( defined($piddle_to_return_results) ){ $n = $piddle_to_return_results->getdim(0); } else { if( defined($array_to_return_results) ){ $n = scalar(@$array_to_return_results); } else { $n = $params->{'n'} or die "n is required if no array or piddle references are specified."; $array_to_return_results = [ (0)x$n ]; } } my ($mean, $stdev); if( ! defined($mean=$params->{'mean'}) ){ $mean = 0.0 } if( ! defined($stdev=$params->{'stdev'}) ){ $stdev = 1.0 } # following code is from "Perl Cookbook" (http://www.oreilly.com/catalog/perlckbk2/) my ($u1, $u2); # uniformly distributed random numbers my $w; # variance, then a weight my ($g1, $g2); # gaussian-distributed numbers my $i; for($i=0;$i<($n-1);$i+=2){ do { $u1 = 2 * $rngobj->rand() - 1; $u2 = 2 * $rngobj->rand() - 1; $w = $u1*$u1 + $u2*$u2; } while ( $w >= 1 ); $w = sqrt( (-2 * log($w)) / $w ); $g2 = ($u1 * $w) * $stdev + $mean; $g1 = ($u2 * $w) * $stdev + $mean; if( defined($array_to_return_results) ){ $array_to_return_results->[$i] = $g1; $array_to_return_results->[$i+1] = $g2; } else { $piddle_to_return_results->set($i, $g1); $piddle_to_return_results->set($i+1, $g2); } } if( $n % 2 == 1 ){ $i = $n-1; do { $u1 = 2 * $rngobj->rand() - 1; $u2 = 2 * $rngobj->rand() - 1; $w = $u1*$u1 + $u2*$u2; } while ( $w >= 1 ); $w = sqrt( (-2 * log($w)) / $w ); $g2 = ($u1 * $w) * $stdev + $mean; $g1 = ($u2 * $w) * $stdev + $mean; if( defined($array_to_return_results) ){ $array_to_return_results->[$i] = $g1; } else { $piddle_to_return_results->set($i, $g1); } } return $array_to_return_results; }