use strict; use warnings; use Time::HiRes; use List::Util qw(min max); use Math::Pari qw(divisors); =head Written =cut my $allLCS = 1; my $subStrSize = 2; # Determines minimum match length. Should be less than half # the minimum interesting match length. The larger this value is the faster the # search runs. if (@ARGV == 0) { print "Finds longest matching substring between any pair of test strings\n"; print "in the given file. Pairs of lines are expected with the first of a\n"; print "pair being the string name and the second the test string."; exit (1); } print "Minimum match length is $subStrSize\n"; my @strings; # Outside the loop so subs see it while (@ARGV) {# process each file # Read in the strings my $filename = shift; print "\nProcessing: $filename\n"; @strings = (); open inFile, "< $filename"; while () { chomp; my $strName = $_; $_ = ; chomp; push @strings, [$strName, $_]; } close inFile; my $lastStr = @strings - 1; my %bestMatches = ('len' => 0); # Best match details my $longest = 0; # Best match length so far (unexpanded) my $startTime = [Time::HiRes::gettimeofday ()]; # Do the search for my $curStr (0..$lastStr) {# each string my ($sourceName, $source) = @{$strings[$curStr]}; my @subStrs = generatePatterns ($source); my $lastSub = @subStrs-1; for my $targetStr (($curStr+1)..$lastStr) {# each remaining string my ($targetName, $target) = @{$strings[$targetStr]}; my $targetLen = length $target; my $localLongest = 0; my @localBests = [(0, 0, 0, 0, 0)]; for my $i (0..$lastSub) { my $offset = 0; while ($offset < $targetLen) { $offset = index $target, $subStrs[$i][0], $offset; last if $offset < 0; my $matchStr2 = substr $target, $offset; my $slipage = 0; my $bestSlip = 0; my $matchLen = 0; my $first = 1; while ($first || $slipage < $subStrSize && $subStrs[$i][1] < $subStrSize) { my $matchStr1 = substr $source, $i * $subStrSize - $slipage; ($matchStr1 ^ $matchStr2) =~ /^\0*/; if ($matchLen < $+[0]) { $bestSlip = $slipage; $matchLen = $+[0]; } $slipage += $subStrs[$i][1]; $first = 0; } next if $matchLen < $localLongest - $subStrSize + 1; $localLongest = $matchLen; my @test = ($curStr, $targetStr, $i * $subStrSize - $bestSlip, $offset, $matchLen); @test = expandMatch (@test); my $dm = $test[4] - $localBests[-1][4]; @localBests = () if $dm > 0; push @localBests, [@test] if $dm >= 0; $offset = $test[3] + $test[4]; next if $test[4] < $longest; $longest = $test[4]; $dm = $longest - $bestMatches{'len'}; next if $dm < 0; %bestMatches = ('len' => $test[4]) if $dm > 0; $bestMatches{"$test[0],$test[1],$test[2],$test[3]"} = $test[4]; $bestMatches{'len'} = $test[4]; } continue {++$offset;} } next if ! $allLCS; if (! @localBests) { print "Didn't find LCS for $sourceName and $targetName\n"; next; } for (@localBests) { my @curr = @$_; printf "%03d:%03d L[%4d] (%4d %4d)\n", $curr[0], $curr[1], $curr[4], $curr[2], $curr[3]; } } } print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n"; my $len = $bestMatches{'len'}; for (keys %bestMatches) { next if $_ eq 'len'; my @curr = split ',', $_; printf "Best match: %s - %s. %d characters starting at %d and %d.\n", $strings[$curr[0]][0], $strings[$curr[1]][0], $len, $curr[2], $curr[3]; } } sub expandMatch { my ($index1, $index2, $str1Start, $str2Start, $matchLen) = @_; my $maxMatch = max (0, min ($str1Start, $subStrSize + 10, $str2Start)); my $matchStr1 = substr ($strings[$index1][1], $str1Start - $maxMatch, $maxMatch); my $matchStr2 = substr ($strings[$index2][1], $str2Start - $maxMatch, $maxMatch); ($matchStr1 ^ $matchStr2) =~ /\0*$/; my $adj = $+[0] - $-[0]; $matchLen += $adj; $str1Start -= $adj; $str2Start -= $adj; return ($index1, $index2, $str1Start, $str2Start, $matchLen); } sub generatePatterns { my @subStrs; my $source = shift; my %strs; for (my $i = 0; $i < (length $source) - $subStrSize + 1; $i += $subStrSize) { my $substr = substr $source, $i, $subStrSize; my ($cycleLen, $str) = findCycle ($substr); push @subStrs, [$substr, $cycleLen]; } #push @subStrs, [$_, $strs{$_}] for keys %strs; return @subStrs; } sub findCycle { my $str = shift; my $copy = $str; my $cycleLen = 0; my $strLen = length ($copy); for (0..($strLen - 1)) { $copy .= substr $copy, 0, 1, ''; $cycleLen = $_ + 1; ($str ^ $copy) =~ /^\0*/; return wantarray ? ($cycleLen, substr $str, 0, $cycleLen) : $cycleLen if $+[0] == $strLen; } return wantarray ? ($strLen, $str) : $strLen; } sub findCycle_1 { my $str = shift; my $strLen = length $str; for ( @{ divisors( $strLen ) } ) { my $copy = $str; $copy .= substr( $copy, 0, $_, '' ); return wantarray ? ($_, substr $str, 0, $_) : $_ if $str eq $copy; } }