-x/N
d = 1 - e
####
-x/N h
p(hit) = (1 - e )
##
##
$cnt_rex x The sample number
$VecSizeBits N Number of bits in the vector
$num_vecs h Number of vectors/hash functions
@vectors Content of vectors
@cnt_bits Number of bits set to 1 in vector
@cnt_colbits Number of collisions: I've broken it out to
[0] = Total document collisions
[1] = Count of documents colliding with 1 vector
[2] = Count of docs colliding with 2 vectors
...
$first_coll Index of first collision...useless, should remove
$first_coll_f Index of first false positive (all vectors collide)
##
##
$ cat analyze_bloom.pl
#!/usr/bin/perl
#
# analyze_bloom.pl
#
# Experiment with Bloom filter variant by BUk
#
use strict;
use warnings;
use autodie;
use Bit::Vector;
use Math::BigInt;
my $VecSizeBits=shift or die "Missing # of bits!";
my $num_vecs =shift or die "missing # of vectors!";
my $iterations =shift or die "missing iteration count!";
if ($iterations<33) {
$iterations = 1<<$iterations;
}
my $VecSize = 1 << $VecSizeBits;
my $VecMask = $VecSize-1;
print "NumVecs=$num_vecs\n";
my @col2numbits = (0, 1);
while (@col2numbits < 1<<$num_vecs) {
@col2numbits = (@col2numbits, map { $_+1 } @col2numbits);
}
my @cnt_colbits = (0) x (1+$num_vecs);
my %report_cnts = map { (1<<$_)=>0 } (8 .. 24);
my @vectors;
my @cnt_bits = (0) x $num_vecs;
$vectors[$_] = Bit::Vector::new(0,$VecSize) for 0 .. $num_vecs-1;
{
my @HDR = (
q{ avg avg calc },
q{BI Samples one bits pct 1s pct 1s },
q{-- ---------- -------- ------ ------ },
);
my $t = int(($num_vecs+1) * 8 - length(" # collisions "))/2;
$HDR[0] .= "-"x$t . " # collisions " . "-"x$t;
$HDR[1] .= " all ";
$HDR[1] .= sprintf(" % 2u ", $_) for 1 .. $num_vecs;
$HDR[1] .= " p(coll)";
$HDR[2] .= ("-"x7 . " ") x ($num_vecs+1) . "------------";
print join("\n", @HDR, "");
}
my $first_coll=0;
my $cnt_coll=0;
my $first_coll_f=0;
my $cnt_coll_f=0;
my $cnt_rex=0;
while (++$cnt_rex < $iterations) {
my @h = map { int(rand($VecSize)) } 0 .. $VecMask;
my $bits=32;
my $collision=0;
for my $i (0 .. $num_vecs-1) {
if ($vectors[$i]->bit_test($h[$i])) {
$collision |= 1<<$i
}
else {
++$cnt_bits[$i]
}
$vectors[$i]->Bit_On($h[$i]);
}
if ($collision) {
++$cnt_colbits[$col2numbits[$collision]];
++$cnt_colbits[0]; # 'any' collision
++$cnt_coll;
$first_coll=$cnt_rex if !$first_coll;
if ($collision == $VecMask) {
++$cnt_coll_f;
$first_coll_f=$cnt_rex if !$first_coll_f;
}
}
if (exists $report_cnts{$cnt_rex}) {
my $ttl_bits=0;
$ttl_bits += $_ for @cnt_bits;
my $avg_bits = $ttl_bits / $num_vecs;
my $avg_pct_full = 100.0 * $ttl_bits / ($num_vecs*$VecSize);
my $calc_pct_full = 100.0 * (1-exp(-$cnt_rex/$VecSize));
printf "% 2u %10u: % 7u %6.2f %6.2f",
$VecSizeBits, $cnt_rex, $avg_bits, $avg_pct_full, $calc_pct_full;
print " ",
join(" ",
map {sprintf"% 7u",$_} @cnt_colbits
);
my $expected = (1-exp(-$cnt_rex/$VecSize))**$num_vecs;
printf " %.10f", $expected;
print "\n";
}
}
$ ./analyze_bloom.pl 10 3 13
NumVecs=3
avg avg calc --------- # collisions ---------
BI Samples one bits pct 1s pct 1s all 1 2 3 p(coll)
-- ---------- -------- ------ ------ ------- ------- ------- ------- ------------
10 256: 222 21.74 22.12 86 72 14 0 0.0108230772
10 512: 405 39.62 39.35 250 186 59 5 0.0609161842
10 1024: 647 63.22 63.21 699 351 265 83 0.2525804578
10 2048: 892 87.11 86.47 1711 494 677 540 0.6464623148
10 4096: 1012 98.83 98.17 3758 511 1000 2247 0.9460533270
$ ./analyze_bloom.pl 12 5 15
NumVecs=5
avg avg calc ----------------- # collisions -----------------
BI Samples one bits pct 1s pct 1s all 1 2 3 4 5 p(coll)
-- ---------- -------- ------ ------ ------- ------- ------- ------- ------- ------- ------------
12 256: 248 6.06 6.06 37 36 1 0 0 0 0.0000008164
12 512: 479 11.70 11.75 138 114 23 1 0 0 0.0000223999
12 1024: 908 22.17 22.12 434 311 103 17 3 0 0.0005295634
12 2048: 1604 39.17 39.35 1296 634 447 170 44 1 0.0094309292
12 4096: 2572 62.81 63.21 3282 915 1038 788 443 98 0.1009251903
12 8192: 3542 86.47 86.47 7372 979 1392 1775 1968 1258 0.4833243641
12 16384: 4020 98.15 98.17 15564 980 1420 2052 3718 7394 0.9117155503