use warnings; use strict; use Time::HiRes; if (@ARGV == 0) { print "Finds longest matching substring between each pair of a set of test\n"; print "strings in the given file. Pairs of lines are expected with the first\n"; print "of a pair being the string name and the second the test string."; exit (1); } my $minmatch = 10; my $startTime = [Time::HiRes::gettimeofday ()]; my %strings; while (<>) { chomp(my $label = $_); chomp(my $string = <>); # Compute all substrings @{$strings{$label}} = map [substr($string, $_), $label, $_], 0..(length($string) - $minmatch); } print "Loaded. Generating combos...\n"; my @keys = sort keys %strings; my @best_overall_match = (0); for my $ki1 (0..($#keys - 1)) { for my $ki2 (($ki1 + 1)..$#keys) { my @strings = sort {$a->[0] cmp $b->[0]} @{$strings{$keys[$ki1]}}, @{$strings{$keys[$ki2]}}; # Now walk through the list. The best match for each string will be the # previous or next element in the list that is not from the original substring, # so for each entry, just look for the next one. See how many initial letters # match and track the best matches my @matchdata = (0); # (length, index1-into-strings, index2-into-strings) for my $i1 (0..($#strings - 1)) { my $i2 = $i1 + 1; ++$i2 while $i2 <= $#strings and $strings[$i2][1] eq $strings[$i1][1]; next if $i2 > $#strings; my ($common) = map length, ($strings[$i1][0] ^ $strings[$i2][0]) =~ /^(\0*)/; next if $common < $minmatch; if ($common > $matchdata[0]) { @matchdata = ($common, [$i1, $i2]); } elsif ($common == $matchdata[0]) { push @matchdata, [$i1, $i2]; } } next if $matchdata[0] < $minmatch; if ($matchdata[0] > $best_overall_match[0]) { @best_overall_match = ($matchdata[0]); } if ($matchdata[0] >= $best_overall_match[0]) { push @best_overall_match, map { ["$strings[$_->[0]][1]:$strings[$_->[0]][2]", "$strings[$_->[1]][1]:$strings[$_->[1]][2]"] } @matchdata[1..$#matchdata]; } print "$keys[$ki1] and $keys[$ki2]: $matchdata[0] chars\n"; for my $i (@matchdata[1..$#matchdata]) { if ($strings[$i->[0]][1] eq $keys[$ki2]) { @{$i}[0,1] = @{$i}[1,0]; } print "... starting at $strings[$i->[0]][2] and $strings[$i->[1]][2], respectively.\n"; } } } print "Best overall match: $best_overall_match[0] chars\n"; print "$_->[0] and $_->[1]\n" for (@best_overall_match[1..$#best_overall_match]) ; print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n";