#! perl -slw
use strict;
use List::Util qw[ sum ];
use Math::Random::MT qw[ rand ];    ## Mersenne Twister replaces the built-in rand
use Data::Dump qw[ pp ];
$Data::Dump::MAX_WIDTH = 1000;

use constant NO_OF_DOCS          => 10000;
use constant DOC_PER_WORD        => NO_OF_DOCS / 2;
use constant OCCURRENCES_PER_DOC => 20;

## Build some test data: for each word, pick ~DOC_PER_WORD random docs
## (duplicate picks collapse, so slightly fewer) and give the word 20
## random 32-bit positions in each.
my @words = qw[ the quick brown fox jumps over a lazy dog twice ];
my %index = map{ $_ => {} } @words;

for my $word ( keys %index ) {
    for my $doc ( map{ int rand NO_OF_DOCS } 1 .. DOC_PER_WORD ) {
        my $docName = sprintf "Doc%05d", $doc;
        $index{ $word }{ $docName } = [
            map{ int rand 2**32 } 1 .. OCCURRENCES_PER_DOC
        ];
    }
}
##pp \%index;

## Build the posting lists in ASCII: "DocName pos:pos:...", space-separated
my @postings = map {
    my $wordHash = $_;
    join ' ', map{
        "$_ " . join ':', @{ $wordHash->{ $_ } };
    } keys %{ $wordHash };
} values %index;
## pp \@postings;

printf "ASCII uses %d bytes\n", sum map length, @postings;

## The same lists in binary: per doc, a byte-counted name ('C/A*')
## followed by the positions packed as 32-bit big-endian ints ('N*');
## the outer '(N/A*)*' then length-prefixes each per-doc record.
my @binPostings = map {
    my $wordHash = $_;
    pack '(N/A*)*', map {
        pack 'C/A* A*', $_, pack 'N*', @{ $wordHash->{ $_ } };
    } keys %{ $wordHash };
} values %index;

printf "Binary uses %d bytes\n", sum map length, @binPostings;
#pp \@binPostings;
<>;

## Unpack the packed binary.
## Note 'a*' rather than 'A*' for the variable-length binary fields: in
## unpack, 'A' strips trailing NULs and spaces, which corrupts packed data.
my @wordOrder = keys %index;    ## same order as the values %index used above,
                                ## unlike @words, whose order need not match
my $wordIndex = 0;
for my $wordData ( @binPostings ) {
    print $wordOrder[ $wordIndex++ ];
    for my $docData ( unpack '(N/a*)*', $wordData ) {
        my( $doc, $posns ) = unpack 'C/A* a*', $docData;
        print "\t$doc: ", join ', ', unpack 'N*', $posns;
    }
}

__END__
## Output for 10 words appearing 20 times in each of
## half of 10,000 4GB documents at random positions.

C:\test>678848
ASCII uses 8,801,314 bytes
Binary uses 3,656,853 bytes
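
## For anyone puzzling over the nested templates, here is a minimal
## standalone sketch of the same round-trip for a single word's posting
## list, using made-up doc names and positions (not data from the run
## above). Position 256 packs with a trailing NUL byte, which is exactly
## the case where 'a*' instead of 'A*' in the unpack matters.

#! perl -slw
use strict;

my %docs = (
    Doc00001 => [ 100, 2000, 30000 ],
    Doc00002 => [ 42, 256 ],
);

## Pack: byte-counted doc name, positions as 32-bit big-endian ints,
## then a 4-byte length prefix around each per-doc record.
my $packed = pack '(N/A*)*',
    map { pack 'C/A* A*', $_, pack 'N*', @{ $docs{ $_ } } } keys %docs;

printf "Packed to %d bytes\n", length $packed;

## Unpack: walk the length-prefixed records back out ('a*', not 'A*',
## so trailing NUL bytes in the binary data survive).
for my $rec ( unpack '(N/a*)*', $packed ) {
    my( $name, $posns ) = unpack 'C/A* a*', $rec;
    print "$name: ", join ', ', unpack 'N*', $posns;
}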