$ cat analyze_bloom.pl #!/usr/bin/perl # # analyze_bloom.pl # # Experiment with Bloom filter variant by BUk # use strict; use warnings; use autodie; use Bit::Vector; use Math::BigInt; my $VecSizeBits=shift or die "Missing # of bits!"; my $num_vecs =shift or die "missing # of vectors!"; my $iterations =shift or die "missing iteration count!"; if ($iterations<33) { $iterations = 1<<$iterations; } my $VecSize = 1 << $VecSizeBits; my $VecMask = $VecSize-1; print "NumVecs=$num_vecs\n"; my @col2numbits = (0, 1); while (@col2numbits < 1<<$num_vecs) { @col2numbits = (@col2numbits, map { $_+1 } @col2numbits); } my @cnt_colbits = (0) x (1+$num_vecs); my %report_cnts = map { (1<<$_)=>0 } (8 .. 24); my @vectors; my @cnt_bits = (0) x $num_vecs; $vectors[$_] = Bit::Vector::new(0,$VecSize) for 0 .. $num_vecs-1; { my @HDR = ( q{ avg avg calc }, q{BI Samples one bits pct 1s pct 1s }, q{-- ---------- -------- ------ ------ }, ); my $t = int(($num_vecs+1) * 8 - length(" # collisions "))/2; $HDR[0] .= "-"x$t . " # collisions " . "-"x$t; $HDR[1] .= " all "; $HDR[1] .= sprintf(" % 2u ", $_) for 1 .. $num_vecs; $HDR[1] .= " p(coll)"; $HDR[2] .= ("-"x7 . " ") x ($num_vecs+1) . "------------"; print join("\n", @HDR, ""); } my $first_coll=0; my $cnt_coll=0; my $first_coll_f=0; my $cnt_coll_f=0; my $cnt_rex=0; while (++$cnt_rex < $iterations) { my @h = map { int(rand($VecSize)) } 0 .. $VecMask; my $bits=32; my $collision=0; for my $i (0 .. $num_vecs-1) { if ($vectors[$i]->bit_test($h[$i])) { $collision |= 1<<$i } else { ++$cnt_bits[$i] } $vectors[$i]->Bit_On($h[$i]); } if ($collision) { ++$cnt_colbits[$col2numbits[$collision]]; ++$cnt_colbits[0]; # 'any' collision ++$cnt_coll; $first_coll=$cnt_rex if !$first_coll; if ($collision == $VecMask) { ++$cnt_coll_f; $first_coll_f=$cnt_rex if !$first_coll_f; } } if (exists $report_cnts{$cnt_rex}) { my $ttl_bits=0; $ttl_bits += $_ for @cnt_bits; my $avg_bits = $ttl_bits / $num_vecs; my $avg_pct_full = 100.0 * $ttl_bits / ($num_vecs*$VecSize); my $calc_pct_full = 100.0 * (1-exp(-$cnt_rex/$VecSize)); printf "% 2u %10u: % 7u %6.2f %6.2f", $VecSizeBits, $cnt_rex, $avg_bits, $avg_pct_full, $calc_pct_full; print " ", join(" ", map {sprintf"% 7u",$_} @cnt_colbits ); my $expected = (1-exp(-$cnt_rex/$VecSize))**$num_vecs; printf " %.10f", $expected; print "\n"; } } $ ./analyze_bloom.pl 10 3 13 NumVecs=3 avg avg calc --------- # collisions --------- BI Samples one bits pct 1s pct 1s all 1 2 3 p(coll) -- ---------- -------- ------ ------ ------- ------- ------- ------- ------------ 10 256: 222 21.74 22.12 86 72 14 0 0.0108230772 10 512: 405 39.62 39.35 250 186 59 5 0.0609161842 10 1024: 647 63.22 63.21 699 351 265 83 0.2525804578 10 2048: 892 87.11 86.47 1711 494 677 540 0.6464623148 10 4096: 1012 98.83 98.17 3758 511 1000 2247 0.9460533270 $ ./analyze_bloom.pl 12 5 15 NumVecs=5 avg avg calc ----------------- # collisions ----------------- BI Samples one bits pct 1s pct 1s all 1 2 3 4 5 p(coll) -- ---------- -------- ------ ------ ------- ------- ------- ------- ------- ------- ------------ 12 256: 248 6.06 6.06 37 36 1 0 0 0 0.0000008164 12 512: 479 11.70 11.75 138 114 23 1 0 0 0.0000223999 12 1024: 908 22.17 22.12 434 311 103 17 3 0 0.0005295634 12 2048: 1604 39.17 39.35 1296 634 447 170 44 1 0.0094309292 12 4096: 2572 62.81 63.21 3282 915 1038 788 443 98 0.1009251903 12 8192: 3542 86.47 86.47 7372 979 1392 1775 1968 1258 0.4833243641 12 16384: 4020 98.15 98.17 15564 980 1420 2052 3718 7394 0.9117155503