#!/usr/bin/env perl
use strict;
use warnings;
use PDL;
use PDL::GSL::RNG;
use PDL::Stats::Basic;
use PDL::Ufunc;
####
# program to get an input data vector (1D)
# and calculate its statistics (mean, stdev)
# and confidence intervals using Bootstrap.
# For demonstration the input data is generated randomly
# with specified mean/stdev from Gaussian distribution.
# Alternatively one can populate the input as sees fit.
# Improving accuracy means increasing the re-sample size
# as well as the number of bootstrap iterations (R and M)
#
# Bootstrap is a non-parametric method to estimate sample
# statistics (unlike a t-test for example which ASSUMES that our
# input data comes from a Gaussian distribution) and it is roughly based
# on re-sampling our data many times (this is where the computer comes
# in and why this method is popular today) and each time
# calculating the statistics (mean, stdev, etc).
# Q: What is a confidence interval (CI)?
# Without getting into too much detail, a CI (as a range of sample values)
# of Y% is a property
# of our sample which says that if you draw random items from
# your sample, their value will be within the CI, Y% of the times.
# See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp
# more info: http://www.stat.wmich.edu/s160/book/node48.html
#
# author: bliako
# date: 03/05/2018
####
my $time_started = time;
# RNG seed or use time_started
my $seed = 1234; # $time_started;
# size of my input sample / data
my $N = 10000;
# number of bootstrap repeats (re-samplings)
my $M = 1000;
# number of re-sample size set as the same as input size
my $R = $N;
print "$0 : starting, N=$N, M=$M, R=$R\n";
# confidence intervals to calculate:
my $confidence_intervals_levels = [5/100, 95/100];
# ad-hoc mean and stdev of our input data
my $dist_mean = pdl(10.35);
# increasing stdev will expand CI
my $dist_stdev = pdl(1.231);
# produce the Random Number Generator to generate random data
# to populate our input sample (just for demo):
my $rng = PDL::GSL::RNG->new('taus')->set_seed($seed);
# produce a random sample from a Gaussian (Normal) distribution
# with specified stdev and mean:
my $X = zeroes($N);
$rng->ran_gaussian($dist_stdev, $X);
$X += $dist_mean;
my $input_data_statistics = [
PDL::Ufunc::avg($X),
PDL::Stats::Basic::stdv($X)
];
print "input data statistics:"
."\n\tmean: ".$input_data_statistics->[0]
."\n\tstdev: ".$input_data_statistics->[1]
."\n";
# Now we will re-sample our input $M times and each time
# we will record the statistic of interest,
# e.g. for demo this will be MEAN (=average)
# arrays to store the results of each iteration wrt mean and stdev:
my $means = zeroes($M);
my $stdevs = zeroes($M);
for(my $i=0;$i<$M;$i++){
# get a re-sample from original data with size R
# use our RNG to do the re-sampling:
my $asample = resample($X, $rng, $R);
$means->set($i, PDL::Ufunc::avg($asample));
$stdevs->set($i, PDL::Stats::Basic::stdv($asample));
}
my $estimated_statistics = [
PDL::Ufunc::avg($means),
PDL::Ufunc::avg($stdevs)
];
my $discrepancies = [
100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0],
100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1]
];
print "estimated statistics by bootstrap (after $M re-samplings):"
."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)"
."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)"
."\n";
# now sort the means vector and pick the elements at
# the confidence intervals specified, e.g. 5%
my $sorted_means = PDL::Ufunc::qsort($means);
my $confidence_intervals_values = [
$sorted_means->at(int($confidence_intervals_levels->[0]*$M)),
$sorted_means->at(int($confidence_intervals_levels->[1]*$M))
];
print "confidence intervals after bootstraping $M times with a sample size of $N each time:\n"
."\t".$confidence_intervals_levels->[0]." => ".$confidence_intervals_values->[0]."\n"
."\t".$confidence_intervals_levels->[1]." => ".$confidence_intervals_values->[1]."\n"
;
print "done in ".(time-$time_started)." seconds.\n";
# get a random sample with replacement from input vector
# 2nd param is a RNG object which will select N random
# elements (with replacement) from input vector
# 3rd param is OPTIONAL: the size of the sample,
# default is the size of the input vector
sub resample {
my $original_sample = $_[0];
my $arng = $_[1];
my $No = $_[2] || $original_sample->getdim(0);
my $indices = ($No * $rng->get_uniform($No))->floor();
my $newsample = $original_sample->slice($indices);
return $newsample;
}
####
# get a random sample with replacement from input vector
# 2nd param is a RNG object which will select N random
# elements (with replacement) from input vector
# 3rd param is OPTIONAL: the size of the sample,
# default is the size of the input vector
sub resample {
my $original_sample = $_[0];
my $arng = $_[1];
my $No = $_[2] || $original_sample->getdim(0);
my $indices = ($No * $rng->get_uniform($No))->floor();
my $newsample = $original_sample->slice($indices);
return $newsample;
}
my $M = 1000; # num bootstrap resamplings
my $X = ... ; # input data piddle (1D)
my $R = $X->getdim(0); # size of bootstrap resamples
my $rng = PDL::GSL::RNG->new('taus')->set_seed($seed);
my $means = zeroes($M);
for(my $i=0;$i<$M;$i++){
# get a re-sample from original data with size R
# use our RNG to do the re-sampling:
my $asample = resample($X, $rng, $R);
$means->set($i, PDL::Ufunc::avg($asample));
}
# now sort the means vector and pick the elements at
# the confidence intervals specified, e.g. 5%
my $sorted_means = PDL::Ufunc::qsort($means);
my $confidence_intervals_values = [
$sorted_means->at(int(0.05*$M)),
$sorted_means->at(int(0.95*$M))
];
##
##
#!/usr/bin/env perl
use strict;
use warnings;
use threads;
use threads::shared;
use PDL;
use PDL::Stats::Basic;
use PDL::Ufunc;
use PDL::Parallel::threads;
use Thread::Queue;
# MT random number generator for thread-safe operation
# not just Math::Random::MT (which gave me problems)
# but Math::Random::MT::Auto (btw neither Math::Random::MTwist/OO works - maybe my fault?)
use Math::Random::MT::Auto;
use Getopt::Long;
####
# program to get an input data vector (1D)
# and calculate its statistics (mean, stdev)
# and confidence intervals using Bootstrap.
# For demonstration the input data is generated randomly
# with specified mean/stdev from Gaussian distribution.
# Alternatively one can populate the input as sees fit.
# Improving accuracy means increasing the re-sample size
# as well as the number of bootstrap iterations (R and M)
#
# Bootstrap is a non-parametric method to estimate sample
# statistics (unlike a t-test for example which ASSUMES that our
# input data comes from a Gaussian distribution) and it is roughly based
# on re-sampling our data many times (this is where the computer comes
# in and why this method is popular today) and each time
# calculating the statistics (mean, stdev, etc).
# Q: What is a confidence interval (CI)?
# Without getting into too much detail, a CI (as a range of sample values)
# of Y% is a property
# of our sample which says that if you draw random items from
# your sample, their value will be within the CI, Y% of the times.
# See also examples here: https://www.investopedia.com/terms/c/confidenceinterval.asp
# more info: http://www.stat.wmich.edu/s160/book/node48.html
#
# author: bliako
# date: 03/05/2018
####
my $input_filename = undef;
# default confidence intervals to calculate:
my @confidence_intervals_levels = ();
my $num_bootstrap_repeats = 1000;
my $bootstrap_sample_size = undef;
my $max_num_threads = 3;
my @demo_params;
if( ! Getopt::Long::GetOptions(
'input|i=s' => \$input_filename,
'confidence-intervals|c=f{2}' => \@confidence_intervals_levels,
'num-bootstrap-repeats|R=i' => \$num_bootstrap_repeats,
'bootstrap-sample-size|S=i' => \$bootstrap_sample_size,
'max-num-threads=i' => \$max_num_threads,
'demo=f{3}' => \@demo_params,
'help' => sub { print usage($0); exit(0) }
) ){ print STDERR usage($0) . "\n\n$0 : something wrong with command line parameters.\n"; exit(1) }
my $time_started = time;
if( scalar(@confidence_intervals_levels) == 0 ){ @confidence_intervals_levels = (5/100, 95/100) }
my ($i, $N, $line);
# my input data piddle
my $X;
# produce the Random Number Generator to generate random data
# to populate our input sample (just for demo):
my $rngMT = Math::Random::MT::Auto->new();
if( ! defined($rngMT) ){ print STDERR "$0 : call to ".'Math::Random::MT::Auto->new()'." has failed.\n"; exit(1) }
if( defined($input_filename) ){
# read data from file:
if( ! open(INP, '<', $input_filename) ){ print STDERR "$0 : could not open input file '$input_filename', $!\n"; exit(1) }
$i = 0;
while( $line = ){
chomp($line);
if( ! ($line =~ /^\s*$/) ){ $i++; }
}
seek(INP, 0, 0); # rewind filehandle
$i = 0;
while( $line = ){
chomp($line);
if( ! ($line =~ /^\s*$/) ){ $X->set($i++, $line) }
}
close(INP);
} elsif( scalar(@demo_params) == 3 ){
# produce a random sample from a Gaussian (Normal) distribution
# with specified stdev and mean for demo
$N = $demo_params[2];
print "$0 : creating random data of $N items with mean = ".$demo_params[0]." and stdev = ".$demo_params[1]." ...\n";
$X = zeroes($N);
# populates above piddle with rands from gaussian dist with specified mean/stdev
# using the RNG obj provided
gaussian_rand({
'mean' => $demo_params[0],
'stdev' => $demo_params[1],
'piddle' => $X,
'rng' => $rngMT
});
} else {
print STDERR usage($0) . "\n$0 : either an input file must be specified (--input) or use the demo switch as : --demo MEAN STDEV SAMPLE_SIZE\n";
exit(1)
}
# size of input data
$N = $X->getdim(0);
$bootstrap_sample_size = $N unless defined($bootstrap_sample_size);
print "$0 : starting with\n\tinput sample size: $N\n\tbootstrap repeats: $num_bootstrap_repeats\n\tbootstrap sample size: $bootstrap_sample_size\n\tparallelising over $max_num_threads threads.\n";
my $input_data_statistics = [
PDL::Ufunc::avg($X),
PDL::Stats::Basic::stdv($X)
];
print "$0 : input data statistics:"
."\n\tmean: ".$input_data_statistics->[0]
."\n\tstdev: ".$input_data_statistics->[1]
."\n";
# Now we will re-sample our input $bootstrap_sample_size times and each time
# we will record the statistic of interest,
# e.g. for demo this will be MEAN (=average)
# arrays to store the results of each iteration wrt mean and stdev:
# size will be equal to number of repeats
my $means = zeroes($num_bootstrap_repeats);
my $stdevs = zeroes($num_bootstrap_repeats);
# we need to share these
$X->share_as('X');
$means->share_as('means');
$stdevs->share_as('stdevs');
my $Q = Thread::Queue->new();
sub worker {
my $tid = threads->tid;
my $worker_rngMT = Math::Random::MT::Auto->new();
if( ! defined($worker_rngMT) ){ print STDERR "$0 : call to ".'Math::Random::MT::Auto->new()'." has failed for worker $tid.\n"; exit(1) }
while( defined(my $item = $Q->dequeue())){
my $shallow_copy_of_X = PDL::Parallel::threads::retrieve_pdls('X');
my $shallow_copy_of_means = PDL::Parallel::threads::retrieve_pdls('means');
my $shallow_copy_of_stdevs = PDL::Parallel::threads::retrieve_pdls('stdevs');
my $asample = resample(
$shallow_copy_of_X,
$worker_rngMT,
$bootstrap_sample_size
);
my $amean = PDL::Ufunc::avg($asample);
my $astdev = PDL::Stats::Basic::stdv($asample);
$shallow_copy_of_means->set($item, $amean);
$shallow_copy_of_stdevs->set($item, $astdev);
#print "I am worker $tid and getting index=$item : $amean, $astdev\n";
}
print "worker $tid is ending\n";
} # worker sub
my @tpool = map{
threads->create(\&worker, $Q)
} 1 .. $max_num_threads;
for(my $i=0;$i<$num_bootstrap_repeats;$i++){
$Q->enqueue($i);
}
$Q->end();
$_->join for @tpool;
#exit(0);
my $estimated_statistics = [
PDL::Ufunc::avg($means),
PDL::Ufunc::avg($stdevs)
];
=pod
my $discrepancies = [
100 * abs($estimated_statistics->[0] - $input_data_statistics->[0])/$input_data_statistics->[0],
100 * abs($estimated_statistics->[1] - $input_data_statistics->[1])/$input_data_statistics->[1]
];
print "estimated statistics by bootstrap (after $M re-samplings):"
."\n\tmean: ".$estimated_statistics->[0]." (discrepancy: ".$discrepancies->[0]." %)"
."\n\tstdev: ".$estimated_statistics->[1]." (discrepancy: ".$discrepancies->[1]." %)"
."\n";
=cut
# now sort the means vector and pick the elements at
# the confidence intervals specified, e.g. 5%
my $sorted_means = PDL::Ufunc::qsort($means);
my @confidence_intervals_values = (
$sorted_means->at(int($confidence_intervals_levels[0]*$num_bootstrap_repeats)),
$sorted_means->at(int($confidence_intervals_levels[1]*$num_bootstrap_repeats))
);
print "confidence intervals after bootstraping $num_bootstrap_repeats times with a sample size of $N each time:\n"
."\t".$confidence_intervals_levels[0]." => ".$confidence_intervals_values[0]."\n"
."\t".$confidence_intervals_levels[1]." => ".$confidence_intervals_values[1]."\n"
;
print "done in ".(time-$time_started)." seconds.\n";
exit(0);
sub usage {
return "Usage : ".$_[0]." --input file.txt | --demo MEAN STDEV SAMPLE_SIZE [--confidence-intervals LEFT RIGHT (e.g. 0.05 0.95 for 5% and 95%)] [--num-bootstrap-repeats N] [--bootstrap-sample-size S (if not specified it will be the same as input sample size)] [--max-num-threads P]\nExample:\n".$_[0]." --demo 10.32 1.23 1000 --max-num-threads 4 --confidence-intervals 0.075 0.925\n"
}
# get a random sample with replacement from input vector
# 2nd param is a RNG object which will select N random
# elements (with replacement) from input vector
# 3rd param is OPTIONAL: the size of the sample,
# default is the size of the input vector
sub resample {
my $original_sample = $_[0];
my $arng = $_[1]; # RNG to give us random indices to sample
my $No = $_[2] || $original_sample->getdim(0); # size of sample
my $newsample = zeroes($No);
for(my $i=$No;$i-->0;){ $newsample->set($i, $original_sample->at(floor($No * $arng->rand()))) }
return $newsample;
}
# returns n random numbers from the gaussian (normal) distribution
# with mean and stdev optionally specified. Default values are mean=0, stdev=1
# An rng object which exports rand([$n]) is required
sub gaussian_rand {
my $params = $_[0];
my $rngobj;
if( ! defined($rngobj=$params->{'rng'}) ){
# now this is weird:
# my $rngobj = $params->{'rng'} or die ... fails once in a while!
die "rng is required, params were: ".join(",",keys %$params)." and rng was ".$params->{'rng'}."\n";
}
my $piddle_to_return_results = $params->{'piddle'};
my $array_to_return_results = $params->{'array'};
my $n = undef;
if( defined($piddle_to_return_results) ){
$n = $piddle_to_return_results->getdim(0);
} else {
if( defined($array_to_return_results) ){
$n = scalar(@$array_to_return_results);
} else {
$n = $params->{'n'} or die "n is required if no array or piddle references are specified.";
$array_to_return_results = [ (0)x$n ];
}
}
my ($mean, $stdev);
if( ! defined($mean=$params->{'mean'}) ){ $mean = 0.0 }
if( ! defined($stdev=$params->{'stdev'}) ){ $stdev = 1.0 }
# following code is from "Perl Cookbook" (http://www.oreilly.com/catalog/perlckbk2/)
my ($u1, $u2); # uniformly distributed random numbers
my $w; # variance, then a weight
my ($g1, $g2); # gaussian-distributed numbers
my $i;
for($i=0;$i<($n-1);$i+=2){
do {
$u1 = 2 * $rngobj->rand() - 1;
$u2 = 2 * $rngobj->rand() - 1;
$w = $u1*$u1 + $u2*$u2;
} while ( $w >= 1 );
$w = sqrt( (-2 * log($w)) / $w );
$g2 = ($u1 * $w) * $stdev + $mean;
$g1 = ($u2 * $w) * $stdev + $mean;
if( defined($array_to_return_results) ){
$array_to_return_results->[$i] = $g1;
$array_to_return_results->[$i+1] = $g2;
} else {
$piddle_to_return_results->set($i, $g1);
$piddle_to_return_results->set($i+1, $g2);
}
}
if( $n % 2 == 1 ){
$i = $n-1;
do {
$u1 = 2 * $rngobj->rand() - 1;
$u2 = 2 * $rngobj->rand() - 1;
$w = $u1*$u1 + $u2*$u2;
} while ( $w >= 1 );
$w = sqrt( (-2 * log($w)) / $w );
$g2 = ($u1 * $w) * $stdev + $mean;
$g1 = ($u2 * $w) * $stdev + $mean;
if( defined($array_to_return_results) ){
$array_to_return_results->[$i] = $g1;
} else {
$piddle_to_return_results->set($i, $g1);
}
}
return $array_to_return_results;
}