use strict ; use warnings ; # Dummies for quick test. open my $FH, '<', &dummy_input or die "horribly $!" ; sub extract_key { my ($line) = @_ ; return (split(/:/, $line))[0] ; } ; # In order to get a false positive out of the digest, this mask is used. # (Should be removed for real use !) use constant SMALL_DIGEST => 0xA11A5885 ; #----------------------------------------------------------------------------------------- # Step 1. Identify digests for possible duplicate keys. # # NB: may identify the same digest value a number of times. my $cand_file = '' ; open my $CAND, '>', \$cand_file ; # Can write to a real file if required ! alloc_bm() ; # swallows 512M memory ! my $line_count = 0 ; my $cand_count = 0 ; while (my $line = <$FH>) { $line_count++ ; my $d = digest(extract_key($line)) ; if (setbit($d)) { print $CAND pack('N', $d) ; $cand_count++ ; } ; } ; dealloc_bm() ; close $CAND ; print STDERR "Read $line_count lines and identified $cand_count candidate digests\n" ; #----------------------------------------------------------------------------------------- # Step 2. Construct the candidate hash. # # NB: this is assuming that there will be a relatively small number of # candidates -- in particular assuming that a hash will be much smaller # than another bitmap ! my %candidates = () ; my $cand_check = 0 ; open $CAND, '<', \$cand_file ; while (1) { my $ds ; my $rc = read $CAND, $ds, 4 ; if (!defined($rc)) { die "failed reading candidates $!" ; } ; last if $rc == 0 ; if ($rc != 4) { die "unexpected EOF reading candidates" ; } ; $cand_check++ ; $candidates{$ds} = undef ; } ; print STDERR "Read $cand_check candidate digests\n" ; if ($cand_check != $cand_count) { die "expected $cand_count candidates" ; } ; close $CAND ; $cand_file = undef ; # Release memory ! #----------------------------------------------------------------------------------------- # Step 3. Identify the duplicate keys and related records. # # NB: this is assuming that this is now a small enough problem ! my %key_hash = () ; my $line_number = 0 ; seek $FH, 0, 0 or die "failed to seek to start of input $!" ; while (my $line = <$FH>) { my $key = extract_key($line) ; my $ds = pack('N', digest($key)) ; if (exists($candidates{$ds})) { push @{$key_hash{$key}}, $line_number ; } ; $line_number++ ; } ; if ($line_number != $line_count) { die "Pass 1 read $line_count lines. Pass 2 read $line_number !" ; } ; close $FH ; #----------------------------------------------------------------------------------------- # Step 4. Discard the not duplicates after all. # # NB: we are here discarding keys which had the same digest as some other key, # but are not equal to any other key. my $discard = 0 ; foreach my $key (keys %key_hash) { if (@{$key_hash{$key}} == 1) { $discard++ ; delete $key_hash{$key} ; } ; } ; print STDERR "Discarded $discard 'not duplicate after all' keys\n" ; #----------------------------------------------------------------------------------------- # Step 5. Process the duplicates # # %key_hash now contains the actual duplicate keys, and for each one a list of # line numbers for the key. print "Indentified:\n" ; foreach my $k (sort keys(%key_hash)) { print STDERR "$k: ", join(', ', @{$key_hash{$k}}), "\n" ; } ; ########################################################################################## # # Bit Map Handling -- 32-bit address space # # We divide the bitmap up into chunks in an array -- to keep individual vec strings to # some 'reasonable' size. # # BMX = number of bits to identify which chunk # BMM = mask to extract chunk number # # BM_COUNT = number of chunks # BM_SIZE = size of each chunk, in bytes use constant BITS => 32 ; use constant BMS => 16 ; # 2**16 = 64K -- size of chunk in BYTES use constant BMX => BMS + 3 ; # Shift to get chunk index use constant BMM => (1 << BMX) - 1 ; # Mask to get bit index use constant BM_COUNT => 1 << (BITS - BMX) ; # Number or chunks in bitmap use constant BM_SIZE => 1 << BMS ; # Size of chunk in BYTES my @bitmap = () ; # The bitmap use String::CRC32 qw(crc32) ; # our hash/digest function #========================================================================================= # alloc_bm: allocate empty bitmap # # Requires: Nothing # # Returns: Nothing sub alloc_bm { $#bitmap = BM_COUNT - 1 ; # pre-extend the array for my $i (0..BM_COUNT - 1) { # fill array with zeroed vec strings $bitmap[$i] = "\0" x BM_SIZE ; } ; } ; #========================================================================================= # dealloc_bm: deallocate bitmap # # Requires: Nothing # # Returns: Nothing sub dealloc_bm { undef @bitmap ; } ; #========================================================================================= # digest: calculate digest # # Requires: $key -- key string # # Returns: $d -- digest value: 32bits of unsigned integer sub digest { my ($key) = @_ ; return crc32($key) & SMALL_DIGEST ; } ; #========================================================================================= # setbit: set bit in bitmap, and return whether was set before. # # Requires: $d -- bitmap index # # Returns: 0 or 1 -- previous bitmap value sub setbit { my ($d) = @_ ; my $vec = $d >> BMX ; my $bit = $d & BMM ; my $was = vec($bitmap[$vec], $bit, 1) ; if (!$was) { vec($bitmap[$vec], $bit, 1) = 1 ; } ; return $was ; } ; ########################################################################################## # Dummy input sub dummy_input { \(<<'~~FILE') } abcdefg: line 0 gdshdff: line 1 djdkled: line 2 hhqmshq: line 3 -- false duplicate (with SMALL_DIGEST) zyxwvut: line 4 abcdefg: line 5 -- duplicate bbewyeb: line 6 piuewpi: line 7 zyxwvut: line 8 -- duplicate hfdjhjh: line 9 abcdefg: line 10 -- duplicate lsjquor: line 11 -- false duplicate (with SMALL_DIGEST) shargfd: line 12 ~~FILE ##########################################################################################