use strict ; use warnings ;

# Dummies for quick test.

open my $FH, '<', &dummy_input or die "horribly $!" ;

sub extract_key {
  my ($line) = @_ ;
  return (split(/:/, $line))[0] ;
} ;

# In order to get a false positive out of the digest, this mask is used.
# (Should be removed for real use !)

use constant SMALL_DIGEST => 0xA11A5885 ;

#-----------------------------------------------------------------------------------------
# Step 1.  Identify digests for possible duplicate keys.
#
#          NB: may identify the same digest value a number of times.

my $cand_file = '' ;
open my $CAND, '>', \$cand_file ;       # Can write to a real file if required !

alloc_bm() ;                            # swallows 512M memory !

my $line_count = 0 ;
my $cand_count = 0 ;

while (my $line = <$FH>) {
  $line_count++ ;
  my $d = digest(extract_key($line)) ;

  if (setbit($d)) { print $CAND pack('N', $d) ; $cand_count++ ; } ;
} ;

dealloc_bm() ;

close $CAND ;

print STDERR "Read $line_count lines and identified $cand_count candidate digests\n" ;


#-----------------------------------------------------------------------------------------
# Step 2.  Construct the candidate hash.
#
#          NB: this is assuming that there will be a relatively small number of 
#              candidates -- in particular assuming that a hash will be much smaller
#              than another bitmap !

my %candidates = () ;
my $cand_check = 0 ;

open $CAND, '<', \$cand_file ;

while (1) {
  my $ds ;
  my $rc = read $CAND, $ds, 4 ;
  if (!defined($rc)) { die "failed reading candidates $!" ;      } ;
  last if $rc == 0 ; 
  if ($rc != 4)      { die "unexpected EOF reading candidates" ; } ;

  $cand_check++ ;

  $candidates{$ds} = undef ;
} ;

print STDERR "Read $cand_check candidate digests\n" ;

if ($cand_check != $cand_count) { die "expected $cand_count candidates" ; } ;

close $CAND ;

$cand_file = undef ;            # Release memory !

#-----------------------------------------------------------------------------------------
# Step 3.  Identify the duplicate keys and related records.
#
#          NB: this is assuming that this is now a small enough problem !

my %key_hash = () ;

my $line_number = 0 ;

seek $FH, 0, 0   or die "failed to seek to start of input $!" ;

while (my $line = <$FH>) {
  my $key = extract_key($line) ;
  my $ds  = pack('N', digest($key)) ;

  if (exists($candidates{$ds})) {
    push @{$key_hash{$key}}, $line_number ;
  } ;

  $line_number++ ;
} ;

if ($line_number != $line_count) {
  die "Pass 1 read $line_count lines.  Pass 2 read $line_number !" ;
} ;

close $FH ;

#-----------------------------------------------------------------------------------------
# Step 4.  Discard the not duplicates after all.
#
#          NB: we are here discarding keys which had the same digest as some other key,
#              but are not equal to any other key.

my $discard = 0 ;

foreach my $key (keys %key_hash) {
  if (@{$key_hash{$key}} == 1) {
    $discard++ ;
    delete $key_hash{$key} ;
  } ;
} ;

print STDERR "Discarded $discard 'not duplicate after all' keys\n" ;

#-----------------------------------------------------------------------------------------
# Step 5.  Process the duplicates
#
#          %key_hash now contains the actual duplicate keys, and for each one a list of
#          line numbers for the key.

print "Indentified:\n" ;

foreach my $k (sort keys(%key_hash)) {
  print STDERR "$k: ", join(', ', @{$key_hash{$k}}), "\n" ;
} ;

##########################################################################################
#
# Bit Map Handling -- 32-bit address space
#
# We divide the bitmap up into chunks in an array -- to keep individual vec strings to
# some 'reasonable' size.
#
#    BMX = number of bits to identify which chunk
#    BMM = mask to extract chunk number
#
#    BM_COUNT  = number of chunks
#    BM_SIZE   = size of each chunk, in bytes

use constant BITS => 32 ;
use constant BMS  => 16 ;		# 2**16 = 64K -- size of chunk in BYTES

use constant BMX  => BMS + 3 ;		# Shift to get chunk index
use constant BMM  => (1 << BMX) - 1 ;	# Mask to get bit index

use constant BM_COUNT  =>  1 << (BITS - BMX) ;
					# Number or chunks in bitmap
use constant BM_SIZE   =>  1 << BMS ;	# Size of chunk in BYTES

my @bitmap       = () ;			# The bitmap

use String::CRC32 qw(crc32) ;           # our hash/digest function

#=========================================================================================
# alloc_bm: allocate empty bitmap
#
# Requires: Nothing
#
# Returns:  Nothing

sub alloc_bm {
  $#bitmap = BM_COUNT - 1 ;         # pre-extend the array
  for my $i (0..BM_COUNT - 1) {     # fill array with zeroed vec strings
    $bitmap[$i] = "\0" x BM_SIZE ;
  } ;
} ;

#=========================================================================================
# dealloc_bm: deallocate bitmap
#
# Requires: Nothing
#
# Returns:  Nothing

sub dealloc_bm {
  undef @bitmap ;
} ;

#=========================================================================================
# digest: calculate digest
#
# Requires: $key     -- key string
#
# Returns:  $d       -- digest value: 32bits of unsigned integer

sub digest {
  my ($key) = @_ ;
  return crc32($key) & SMALL_DIGEST ;
} ;

#=========================================================================================
# setbit: set bit in bitmap, and return whether was set before.
#
# Requires: $d     -- bitmap index
# 
# Returns:  0 or 1 -- previous bitmap value

sub setbit {
  my ($d) = @_ ;

  my $vec = $d >> BMX ;
  my $bit = $d & BMM ;

  my $was = vec($bitmap[$vec], $bit, 1) ;

  if (!$was) { vec($bitmap[$vec], $bit, 1) = 1 ; } ;

  return $was ;
} ;

##########################################################################################
# Dummy input
sub dummy_input { \(<<'~~FILE') }
abcdefg: line  0
gdshdff: line  1
djdkled: line  2
hhqmshq: line  3 -- false duplicate (with SMALL_DIGEST)
zyxwvut: line  4
abcdefg: line  5 -- duplicate
bbewyeb: line  6
piuewpi: line  7
zyxwvut: line  8 -- duplicate
hfdjhjh: line  9
abcdefg: line 10 -- duplicate
lsjquor: line 11 -- false duplicate (with SMALL_DIGEST)
shargfd: line 12
~~FILE
##########################################################################################