=head1 NAME
Statistics::SGT - Simple Good-Turing Frequency Estimator
=head1 SYNOPSIS
use Statistics::SGT;
=head1 DESCRIPTION
The function C<Statistics::SGT::sgt> takes a set of
I<S<( int:frequency, int:frequency_of_frequency )>> pairs, and
applies the Simple Good-Turing technique for estimating
the probabilities corresponding to the observed frequencies,
and P.0, the joint probability of all unobserved species.
The input should consist of a list of array-refs, each of which
contains two positive integers: an observed frequency, and the
frequency of that frequency.
No checks are made for linearity; the routine simply assumes that the
requirements for using the SGT estimator are met.
The output is a list of I<S<( int:frequency, [0,1.0]:estimated_probabi
+lity )>> pairs.
The set of frequencies is the same as in the input array,
plus the addition of a I<S<( 0, P.0 )>> element.
=head1 REFERENCES
This is a Perl adaptation of a C program originally written by:
B<Geoffrey Sampson,
School of Cognitive and Computing Sciences,
University of Sussex (England),
1995-06-27.>
The Simple Good-Turing technique was devised by William A. Gale of AT&
+T Bell Labs.
It is described in:
B<Gale, William A., and Geoffrey Sampson,
'Good-Turing Frequency Estimation Without Tears',
I<Journal of Quantitative Linguistics> 2.217-37, 1995.>
See also: http://www.grsampson.net/RGoodTur.html
=cut
package Statistics::SGT;
use Exporter;
use strict;
use vars qw( @ISA @EXPORT_OK $VERSION $MIN_INPUT $EPSILON );
@ISA = qw(Exporter);
@EXPORT_OK = qw( &sgt );
$VERSION = '0.01';
$MIN_INPUT = 5;
$EPSILON = 0.00000001; # use for n, in place of 0, to prevent divide-b
+y-zero.
{
package SGT_::Smoother;
sub slope { $_[0]{'slope'} }
sub intercept { $_[0]{'intercept'} }
sub smoothed { # double (int)
my $self = shift;
my $i = shift;
exp( $self->intercept + $self->slope * log($i) );
}
sub new {
my $pkg = shift;
my $X_name = shift;
my $Y_name = shift;
my $ar = shift; # array ref (\@recs)
ref($ar) && $ar =~ /\bARRAY\b/ or die "SGT_::Smoother::new($ar) --
+ arg not an arrayref!";
@$ar > 0 or die "SGT_::Smoother::new($ar) -- array passed is empty
+!";
my $meanX = 0.0;
my $meanY = 0.0;
for ( @$ar ) {
$meanX += $_->{ $X_name };
$meanY += $_->{ $Y_name };
}
$meanX /= scalar(@$ar);
$meanY /= scalar(@$ar);
my $XYs = 0.0;
my $Xsquares = 0.0;
for ( @$ar ) {
$XYs += ($_->{ $X_name } - $meanX) * ($_->{ $Y_name } - $meanY);
$Xsquares += square($_->{ $X_name } - $meanX);
}
$Xsquares != 0 or die "SGT_::Smoother::new: \$Xsquares is zero!";
my $slope = $XYs / $Xsquares;
bless {
'slope' => $slope,
'intercept' => $meanY - $slope * $meanX,
}, $pkg;
}
sub square($) { $_[0] ** 2.0 }
}
=head1 FUNCTIONS
=head2 sgt()
The input consists of a set of pairs of numbers ( r, n ).
Each pair can be in the form of an array, in which case the [0] elemen
+t is
taken for r, and the [1] element is taken for n.
Alternatively, it can take the form of a hash, in which case it must
contain an {'r'} element and an {'n'} element, which are used directly
+.
If called in a list context, this function returns a list of 'records'
+,
each of which is a hash, described below. The array is ordered by
the [0] (r) element of each record.
If called in a scalar context, this function returns the records in
a hash, which is returned by reference. This hash is keyed by the 'r'
element of each record.
If called in a void context, nothing is returned; this may be useful i
+f the
package variable $SET_OUTVALS is set to true prior to calling this fun
+ction.
If that variable is true, the values of the input array are modified i
+n the
following way: if the value is an arrayref (r in [0] and n in [1]), t
+hen
the derived 'p' value is placed in position [2]; if the value is a has
+href,
then an element 'p' is added to the hash.
$SET_OUTVALS is false by default: the input array is not modified in a
+ny way.
The resulting records are hashrefs, each with the following elements:
r
n
p -- final derived result for this r
The records also contain some derived values only used internally to a
+lgorithm:
log_r
log_Z
rStar
j -- the record's index in the record array.
This function throws an exception on any detected error.
=cut
#
# in_* *_AofA => [ [ ] ]
# out_* *_AofH => [ { } ]
# inout_* *_HofA => [ [ ] ]
# *_HofH => { { } }
#
# rtn => ( AofA | AofH | HofA | HofH )
#
sub sgt {
my %args = @_;
my( $in_key, $out_key, $rtn_type ) = do {
my @in_keys = grep { /^(-?)in/i } keys %args;
my @out_keys = grep { /out_/i } keys %args;
my @rtn_keys = grep { /^(-?)rtn$/i } keys %args;
my $rrrr = @in_keys ? 'many' : 'few';
@in_keys == 1 or die "Too $rrrr IN ARG specifiers! (must be exact
+ly 1)";
@out_keys > 1 and die "Too many OUT ARG specifiers! (must be at mo
+st 1)";
@rtn_keys > 1 and die "Too many RETURN specifiers! (must be at mo
+st 1)";
( $in_keys[0], $out_keys[0], @rtn_keys ? $args{$rtn_keys[0]} : und
+ef );
};
my $in_arg_ref = $args{$in_key};
my $out_arg_ref = defined $out_key ? $args{$out_key} : undef ; # may
+ be undef
# $rtn_type is a string of the form "AofH".
ref($in_arg_ref) or die "IN ARG value must be either a array ref or
+an hash ref!";
if ( $out_arg_ref ) {
ref($out_arg_ref) or die "OUT ARG value must be either a array ref
+ or an hash ref!";
}
# returns a vector of two single-character strings, e.g. ( 'a', 'h'
+).
my $extype = sub {
my $kt = shift;
defined $kt or return(' ',' '); # undef: no key.
lc($kt) =~ /(?:^|_)([ah])of?([ah])$/ or die "Arg key type '$kt' do
+esn't end with _[AH]of[AH] !!!";
( $1, $2 )
};
my @in_arg_type = $extype->( $in_key );
my @out_arg_type = $extype->( $out_key );
my @rtn_type = $extype->( $rtn_type );
my $in_arg_type = join '', @in_arg_type;
my $out_arg_type = join '', @out_arg_type;
$rtn_type = join '', @rtn_type;
#
# barf if the stupid user does something stupid like
# in_HofA => \@foo
# or
# inout_AofH => \%foo
#
my %ul = qw( a ARRAY h HASH );
$in_arg_ref =~ /\b$ul{$in_arg_type[0]}\b/ or die "Actual IN ARG valu
+e does not match specified type!
($ul{$in_arg_type[0]} specified, $in_arg_ref passed.";
if ( $out_arg_ref ) {
$out_arg_ref =~ /\b$ul{$out_arg_type[0]}\b/ or die "Actual OUT ARG
+ value does not match specified type!
($ul{$out_arg_type[0]} specified, $out_arg_ref passed.";
}
#
# r and n are both supposed to be non-zero positive integers ("natur
+al numbers").
#
my %get_elem_rn = (
'a' => sub {
my $r = shift;
@$r >= 2 or die "Too few elements in input element array";
@$r;
},
'h' => sub {
my $rec_hr = shift;
my $default_r = shift;
my $r = $rec_hr->{'r'};
exists $rec_hr->{'n'} or die "Missing 'n' field in input eleme
+nt hash";
if ( defined $r and defined $default_r ) {
$r == $default_r or
die "'r' value in element ($r) != key of input hash ($defa
+ult_r) !!!";
}
unless ( defined $r ) { # what to do if 'r' not present
if ( defined $default_r ) {
$r = $default_r;
}
else {
die "Missing 'r' field in input element hash";
}
}
( $r, $rec_hr->{'n'} );
},
);
#
# our working data structures.
# %recs is the main thing; it is keyed by 'r' value.
# @recs is essentially a perlish kludge for a doubly-linked list;
# these are the only operations it needs to support:
# prev/next; is_first/is_last;
# the elements of @recs are ordered by 'r' value.
#
my %recs;
my @recs;
#
# if input is AofA, (r,n) is taken from [0,1]; both must be defined.
# if input is AofH, (r,n) is taken from {'r','n'}; both must be defi
+ned.
# if input is HofA, (r,n) is taken from [0,1], and [0] must equal th
+e hash key.
# if input is HofH, (r,n) is taken from {'r','n'}, according to the
+following rules:
# {'r'} may be missing, in which case (r) is taken from the hash
+key;
# if {'r'} is present, it must equal the hash key.
#
{
my @keys = $in_arg_type[0] eq 'a' ? () : keys %$in_arg_ref;
my @in_recs = $in_arg_type[0] eq 'a' ? @$in_arg_ref : @{$in_arg_ref}
+{@keys};
for my $i ( 0 .. $#in_recs ) {
my $el = $in_recs[$i];
ref($el) or die "Input element not a reference! ($el)";
$el =~ /\b$ul{$in_arg_type[1]}\b/ or die "Input element ref does n
+ot match specified type!
($ul{$in_arg_type[1]} specified, $in_arg_ref passed.";
my( $r, $n ) = $get_elem_rn{$in_arg_type[1]}->($el, $keys[$i]); #
+default r
$recs{$r} = {
'r' => $r,
'n' => $n || $EPSILON,
'log_r' => log($r),
};
}
}
#
# %recs has now been initialized from the input data.
#
exists $recs{1} or
die "Error: no value '1' in input!";
#@recs = sort { $a->{'r'} <=> $b->{'r'} } values %recs;
@recs = @recs{ sort { $a <=> $b } keys %recs };
@recs >= $MIN_INPUT or
die "Not enough input values (\$Statistics\::SGT\::MIN_INPUT = $MI
+N_INPUT)";
for my $j ( 0 .. $#recs ) {
$recs[$j]{'j'} = $j; # let records find themselves in the @recs a
+rray.
}
#
# now we're done initializing our working structures (@recs, %recs)
+from the input data;
# commence to analyse the data:
#
for ( @recs ) {
my $j = $_->{'j'};
my $i = (
$j == 0 # first row?
? 0
: $recs[$j - 1]{'r'} # 'r' of previous row
);
my $k = (
$j == $#recs # last row?
? (2 * $_->{'r'} - $i) # use the same delta as on the last st
+ep.
: $recs[$j + 1]{'r'} # 'r' of next row
);
($k - $i) or die "Statistics::SGT::sgt(): divide by zero immanent!
+";
$_->{'log_Z'} = log( 2 * $_->{'n'} / ($k - $i) );
}
my $smoother = new SGT_::Smoother 'log_r', 'log_Z', \@recs;
my $indiffValsSeen = 0; # false
for ( @recs ) {
$smoother->smoothed($_->{'r'})
or die "Statistics::SGT::sgt(): divide by zero immanent!";
my $y = ($_->{'r'} + 1) * $smoother->smoothed($_->{'r'} + 1) / $sm
+oother->smoothed($_->{'r'});
exists $recs{$_->{'r'} + 1} or
$indiffValsSeen = 1; # true
unless ( $indiffValsSeen ) {
my $next_n = $recs{$_->{'r'} + 1}{'n'};
my $x = ($_->{'r'} + 1) * $next_n / $_->{'n'}; # n=0 elements ha
+ve been removed from input.
if (
abs($x - $y)
<=
1.96 * sqrt(
SGT_::Smoother::square($_->{'r'} + 1.0) * $next_n
/
SGT_::Smoother::square($_->{'n'} ) * (1 + $next_n / $
+_->{'n'})
)
) {
$indiffValsSeen = 1; # true
}
else {
$_->{'rStar'} = $x;
}
}
$indiffValsSeen and
$_->{'rStar'} = $y;
}
my $bigN = 0; # int
for ( @recs ) {
$bigN += $_->{'r'} * $_->{'n'};
}
$bigN or die "divide-by-zero immanent! (bigN)";
my $P_0 = $recs{1}{'n'} / $bigN; # double
my $bigNprime = 0.0; # double
for ( @recs ) {
$bigNprime += $_->{'rStar'} * $_->{'n'};
}
$bigNprime or die "divide-by-zero immanent! (bigNprime)";
for ( @recs ) {
$_->{'p'} = (1 - $P_0) * $_->{'rStar'} / $bigNprime;
}
#
# P_0 gets added to the output list, in position [0] and {0}.
# but first we check that there's nothing there already.
#
exists $recs{0} and die "Already a P.0 element in the working data s
+et!";
$recs{0} = { 'r' => 0, 'n' => 0, 'p' => $P_0 };
unshift @recs, $recs{0};
#
# Now process the out param and/or return requests.
#
#
# we already have AofH and HofH, suitable for returning.
#
# note that the default case is 'ah'... but ' ' also falls into it!
# thus effectively making 'ah' the default return type.
if ( $rtn_type eq 'ha' ) {
return(
wantarray
? ( map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs )
: { map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs }
);
}
elsif ( $rtn_type eq 'aa' ) {
return(
wantarray
? ( map { [ @{$_}{qw(r n p)} ] } @recs )
: [ map { [ @{$_}{qw(r n p)} ] } @recs ]
);
}
elsif ( $rtn_type eq 'hh' ) {
return( wantarray ? %recs : \%recs );
}
else { # 'ah' and ' ' fall into this case:
return( wantarray ? @recs : \@recs );
}
}
=head1 AUTHOR, DATE, COPYRIGHT
Perl code by JDPORTER@cpan.org (John Porter), 2000-01-31.
This Perl module is free software, and may be modified and/or
redistributed under the same terms as Perl itself.
=cut
1;
|