note bliako <p>Good idea!</p> <p>Here is my take on #100 (TODO)</p> <p>The long version as a standalone program:</p> <readmore> <code> #!/usr/bin/env perl use strict; use warnings; use PDL; use PDL::GSL::RNG; use PDL::Stats::Basic; use PDL::Ufunc; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my $time_started = time; # RNG seed or use time_started my $seed = 1234; # $time_started; # size of my input sample / data my $N = 10000; # number of bootstrap repeats (re-samplings) my $M = 1000; # number of re-sample size set as the same as input size my $R = $N; print "$0 : starting, N=$N, M=$M, R=$R\n"; # confidence intervals to calculate: my $confidence_intervals_levels = [5/100, 95/100]; # ad-hoc mean and stdev of our input data my $dist_mean = pdl(10.35); # increasing stdev will expand CI my $dist_stdev = pdl(1.231); # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my $rng = PDL::GSL::RNG->new('taus')->set_seed($seed); # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean: my $X = zeroes($N); $rng->ran_gaussian($dist_stdev, $X); $X += $dist_mean; my $input_data_statistics = [ PDL::Ufunc::avg($X), PDL::Stats::Basic::stdv($X) ]; print "input data statistics:" ."\n\tmean: ".$input_data_statistics->[0] ."\n\tstdev: ".$input_data_statistics->[1] ."\n"; # Now we will re-sample our input $M times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: my $means = zeroes($M); my $stdevs = zeroes($M); for(my $i=0;$i<$M;$i++){ # get a re-sample from original data with size R # use our RNG to do the re-sampling: my $asample = resample($X, $rng, $R); $means->set($i, PDL::Ufunc::avg($asample)); $stdevs->set($i, PDL::Stats::Basic::stdv($asample)); } my $estimated_statistics = [ PDL::Ufunc::avg($means), PDL::Ufunc::avg($stdevs) ]; my $discrepancies = [ 100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0], 100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after $M re-samplings):" ."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)" ."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)" ."\n"; # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my $sorted_means = PDL::Ufunc::qsort($means); my $confidence_intervals_values = [ $sorted_means->at(int($confidence_intervals_levels->[0]*$M)), $sorted_means->at(int($confidence_intervals_levels->[1]*$M)) ]; print "confidence intervals after bootstraping $M times with a sample size of $N each time:\n" ."\t".$confidence_intervals_levels->[0]." => ".$confidence_intervals_values->[0]."\n" ."\t".$confidence_intervals_levels->[1]." => ".$confidence_intervals_values->[1]."\n" ; print "done in ".(time-$time_started)." seconds.\n"; # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my $original_sample = $_[0]; my $arng = $_[1]; my $No = $_[2] || $original_sample->getdim(0); my $indices = ($No * $rng->get_uniform($No))->floor(); my $newsample = $original_sample->slice($indices); return $newsample; } </code> </readmore> <p>The short version:</p> <code> # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my $original_sample = $_[0]; my $arng = $_[1]; my $No = $_[2] || $original_sample->getdim(0); my $indices = ($No * $rng->get_uniform($No))->floor(); my $newsample = $original_sample->slice($indices); return $newsample; } my $M = 1000; # num bootstrap resamplings my $X = ... ; # input data piddle (1D) my $R = $X->getdim(0); # size of bootstrap resamples my $rng = PDL::GSL::RNG->new('taus')->set_seed($seed); my $means = zeroes($M); for(my $i=0;$i<$M;$i++){ # get a re-sample from original data with size R # use our RNG to do the re-sampling: my $asample = resample($X, $rng, $R); $means->set($i, PDL::Ufunc::avg($asample)); } # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my $sorted_means = PDL::Ufunc::qsort($means); my $confidence_intervals_values = [ $sorted_means->at(int(0.05*$M)), $sorted_means->at(int(0.95*$M)) ]; </code> <h3>EDIT 1:</h3> <p>The above script can easily be parallelised. Input data is readonly (X). Each worker writes results (mean, stdev) to the same piddle but at different array locations guaranteed. So, there is no need for locking afaics.</p> <p>Unfortunately I can not get it to parallelise with threads because PDL::GSL::RNG seems not to like threads. The RNG is needed in order to get a random sample from the original data on every bootstrap iteration. In fact each worker/thread can have its own RNG, not a copy but a different RNG local to each thread. However, even like this I get the dreaded <code>pointer being freed was not allocated *** set a breakpoint in malloc_error_break to debug</code> </p> <p>Any ideas?</p> <h3>EDIT 2: the parallelised version as a standalone program:</h3> <readmore> <code> #!/usr/bin/env perl use strict; use warnings; use threads; use threads::shared; use PDL; use PDL::Stats::Basic; use PDL::Ufunc; use PDL::Parallel::threads; use Thread::Queue; # MT random number generator for thread-safe operation # not just Math::Random::MT (which gave me problems) # but Math::Random::MT::Auto (btw neither Math::Random::MTwist/OO works - maybe my fault?) use Math::Random::MT::Auto; use Getopt::Long; #### # program to get an input data vector (1D) # and calculate its statistics (mean, stdev) # and confidence intervals using Bootstrap. # For demonstration the input data is generated randomly # with specified mean/stdev from Gaussian distribution. # Alternatively one can populate the input as sees fit. # Improving accuracy means increasing the re-sample size # as well as the number of bootstrap iterations (R and M) # # Bootstrap is a non-parametric method to estimate sample # statistics (unlike a t-test for example which ASSUMES that our # input data comes from a Gaussian distribution) and it is roughly based # on re-sampling our data many times (this is where the computer comes # in and why this method is popular today) and each time # calculating the statistics (mean, stdev, etc). # Q: What is a confidence interval (CI)? # Without getting into too much detail, a CI (as a range of sample values) # of Y% is a property # of our sample which says that if you draw random items from # your sample, their value will be within the CI, Y% of the times. # See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp # more info: http://www.stat.wmich.edu/s160/book/node48.html # # author: bliako # date: 03/05/2018 #### my $input_filename = undef; # default confidence intervals to calculate: my @confidence_intervals_levels = (); my $num_bootstrap_repeats = 1000; my $bootstrap_sample_size = undef; my $max_num_threads = 3; my @demo_params; if( ! Getopt::Long::GetOptions( 'input|i=s' => \$input_filename, 'confidence-intervals|c=f{2}' => \@confidence_intervals_levels, 'num-bootstrap-repeats|R=i' => \$num_bootstrap_repeats, 'bootstrap-sample-size|S=i' => \$bootstrap_sample_size, 'max-num-threads=i' => \$max_num_threads, 'demo=f{3}' => \@demo_params, 'help' => sub { print usage($0); exit(0) } ) ){ print STDERR usage($0) . "\n\n$0 : something wrong with command line parameters.\n"; exit(1) } my $time_started = time; if( scalar(@confidence_intervals_levels) == 0 ){ @confidence_intervals_levels = (5/100, 95/100) } my ($i, $N, $line); # my input data piddle my $X; # produce the Random Number Generator to generate random data # to populate our input sample (just for demo): my $rngMT = Math::Random::MT::Auto->new(); if( ! defined($rngMT) ){ print STDERR "$0 : call to ".'Math::Random::MT::Auto->new()'." has failed.\n"; exit(1) } if( defined($input_filename) ){ # read data from file: if( ! open(INP, '<', $input_filename) ){ print STDERR "$0 : could not open input file '$input_filename', $!\n"; exit(1) } $i = 0; while( $line = <INP> ){ chomp($line); if( ! ($line =~ /^\s*$/) ){ $i++; } } seek(INP, 0, 0); # rewind filehandle $i = 0; while( $line = <INP> ){ chomp($line); if( ! ($line =~ /^\s*$/) ){ $X->set($i++, $line) } } close(INP); } elsif( scalar(@demo_params) == 3 ){ # produce a random sample from a Gaussian (Normal) distribution # with specified stdev and mean for demo $N = $demo_params[2]; print "$0 : creating random data of $N items with mean = ".$demo_params[0]." and stdev = ".$demo_params[1]." ...\n"; $X = zeroes($N); # populates above piddle with rands from gaussian dist with specified mean/stdev # using the RNG obj provided gaussian_rand({ 'mean' => $demo_params[0], 'stdev' => $demo_params[1], 'piddle' => $X, 'rng' => $rngMT }); } else { print STDERR usage($0) . "\n$0 : either an input file must be specified (--input) or use the demo switch as : --demo MEAN STDEV SAMPLE_SIZE\n"; exit(1) } # size of input data $N = $X->getdim(0); $bootstrap_sample_size = $N unless defined($bootstrap_sample_size); print "$0 : starting with\n\tinput sample size: $N\n\tbootstrap repeats: $num_bootstrap_repeats\n\tbootstrap sample size: $bootstrap_sample_size\n\tparallelising over $max_num_threads threads.\n"; my $input_data_statistics = [ PDL::Ufunc::avg($X), PDL::Stats::Basic::stdv($X) ]; print "$0 : input data statistics:" ."\n\tmean: ".$input_data_statistics->[0] ."\n\tstdev: ".$input_data_statistics->[1] ."\n"; # Now we will re-sample our input $bootstrap_sample_size times and each time # we will record the statistic of interest, # e.g. for demo this will be MEAN (=average) # arrays to store the results of each iteration wrt mean and stdev: # size will be equal to number of repeats my $means = zeroes($num_bootstrap_repeats); my $stdevs = zeroes($num_bootstrap_repeats); # we need to share these $X->share_as('X'); $means->share_as('means'); $stdevs->share_as('stdevs'); my $Q = Thread::Queue->new(); sub worker { my $tid = threads->tid; my $worker_rngMT = Math::Random::MT::Auto->new(); if( ! defined($worker_rngMT) ){ print STDERR "$0 : call to ".'Math::Random::MT::Auto->new()'." has failed for worker $tid.\n"; exit(1) } while( defined(my $item = $Q->dequeue())){ my $shallow_copy_of_X = PDL::Parallel::threads::retrieve_pdls('X'); my $shallow_copy_of_means = PDL::Parallel::threads::retrieve_pdls('means'); my $shallow_copy_of_stdevs = PDL::Parallel::threads::retrieve_pdls('stdevs'); my $asample = resample( $shallow_copy_of_X, $worker_rngMT, $bootstrap_sample_size ); my $amean = PDL::Ufunc::avg($asample); my $astdev = PDL::Stats::Basic::stdv($asample); $shallow_copy_of_means->set($item, $amean); $shallow_copy_of_stdevs->set($item, $astdev); #print "I am worker $tid and getting index=$item : $amean, $astdev\n"; } print "worker $tid is ending\n"; } # worker sub my @tpool = map{ threads->create(\&worker, $Q) } 1 .. $max_num_threads; for(my $i=0;$i<$num_bootstrap_repeats;$i++){ $Q->enqueue($i); } $Q->end(); $_->join for @tpool; #exit(0); my $estimated_statistics = [ PDL::Ufunc::avg($means), PDL::Ufunc::avg($stdevs) ]; =pod my $discrepancies = [ 100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0], 100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1] ]; print "estimated statistics by bootstrap (after $M re-samplings):" ."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)" ."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)" ."\n"; =cut # now sort the means vector and pick the elements at # the confidence intervals specified, e.g. 5% my $sorted_means = PDL::Ufunc::qsort($means); my @confidence_intervals_values = ( $sorted_means->at(int($confidence_intervals_levels[0]*$num_bootstrap_repeats)), $sorted_means->at(int($confidence_intervals_levels[1]*$num_bootstrap_repeats)) ); print "confidence intervals after bootstraping $num_bootstrap_repeats times with a sample size of $N each time:\n" ."\t".$confidence_intervals_levels[0]." => ".$confidence_intervals_values[0]."\n" ."\t".$confidence_intervals_levels[1]." => ".$confidence_intervals_values[1]."\n" ; print "done in ".(time-$time_started)." seconds.\n"; exit(0); sub usage { return "Usage : ".$_[0]." --input file.txt | --demo MEAN STDEV SAMPLE_SIZE [--confidence-intervals LEFT RIGHT (e.g. 0.05 0.95 for 5% and 95%)] [--num-bootstrap-repeats N] [--bootstrap-sample-size S (if not specified it will be the same as input sample size)] [--max-num-threads P]\nExample:\n".$_[0]." --demo 10.32 1.23 1000 --max-num-threads 4 --confidence-intervals 0.075 0.925\n" } # get a random sample with replacement from input vector # 2nd param is a RNG object which will select N random # elements (with replacement) from input vector # 3rd param is OPTIONAL: the size of the sample, # default is the size of the input vector sub resample { my $original_sample = $_[0]; my $arng = $_[1]; # RNG to give us random indices to sample my $No = $_[2] || $original_sample->getdim(0); # size of sample my $newsample = zeroes($No); for(my $i=$No;$i-->0;){ $newsample->set($i, $original_sample->at(floor($No * $arng->rand()))) } return $newsample; } # returns n random numbers from the gaussian (normal) distribution # with mean and stdev optionally specified. Default values are mean=0, stdev=1 # An rng object which exports rand([$n]) is required sub gaussian_rand { my $params = $_[0]; my $rngobj; if( ! defined($rngobj=$params->{'rng'}) ){ # now this is weird: # my $rngobj = $params->{'rng'} or die ... fails once in a while! die "rng is required, params were: ".join(",",keys %$params)." and rng was ".$params->{'rng'}."\n"; } my $piddle_to_return_results = $params->{'piddle'}; my $array_to_return_results = $params->{'array'}; my $n = undef; if( defined($piddle_to_return_results) ){ $n = $piddle_to_return_results->getdim(0); } else { if( defined($array_to_return_results) ){ $n = scalar(@$array_to_return_results); } else { $n = $params->{'n'} or die "n is required if no array or piddle references are specified."; $array_to_return_results = [ (0)x$n ]; } } my ($mean, $stdev); if( ! defined($mean=$params->{'mean'}) ){ $mean = 0.0 } if( ! defined($stdev=$params->{'stdev'}) ){ $stdev = 1.0 } # following code is from "Perl Cookbook" (http://www.oreilly.com/catalog/perlckbk2/) my ($u1, $u2); # uniformly distributed random numbers my $w; # variance, then a weight my ($g1, $g2); # gaussian-distributed numbers my $i; for($i=0;$i<($n-1);$i+=2){ do { $u1 = 2 * $rngobj->rand() - 1; $u2 = 2 * $rngobj->rand() - 1; $w = $u1*$u1 + $u2*$u2; } while ( $w >= 1 ); $w = sqrt( (-2 * log($w)) / $w ); $g2 = ($u1 * $w) * $stdev + $mean; $g1 = ($u2 * $w) * $stdev + $mean; if( defined($array_to_return_results) ){ $array_to_return_results->[$i] = $g1; $array_to_return_results->[$i+1] = $g2; } else { $piddle_to_return_results->set($i, $g1); $piddle_to_return_results->set($i+1, $g2); } } if( $n % 2 == 1 ){ $i = $n-1; do { $u1 = 2 * $rngobj->rand() - 1; $u2 = 2 * $rngobj->rand() - 1; $w = $u1*$u1 + $u2*$u2; } while ( $w >= 1 ); $w = sqrt( (-2 * log($w)) / $w ); $g2 = ($u1 * $w) * $stdev + $mean; $g1 = ($u2 * $w) * $stdev + $mean; if( defined($array_to_return_results) ){ $array_to_return_results->[$i] = $g1; } else { $piddle_to_return_results->set($i, $g1); } } return $array_to_return_results; } </code> </readmore> <p>corrections welcome (especially on good practices about how to parallelise)</p> 1213987 1213987