Another update fixing the last round of issues.
use strict;
use warnings;
use Time::HiRes;
use List::Util qw(min max);
use Math::Pari qw(divisors);
=head
Written
=cut
my $allLCS = 1;
my $subStrSize = 2; # Determines minimum match length. Should be less
+than half
# the minimum interesting match length. The larger this value is the f
+aster the
# search runs.
if (@ARGV == 0)
{
print "Finds longest matching substring between any pair of test s
+trings\n";
print "in the given file. Pairs of lines are expected with the fir
+st of a\n";
print "pair being the string name and the second the test string."
+;
exit (1);
}
print "Minimum match length is $subStrSize\n";
my @strings; # Outside the loop so subs see it
while (@ARGV)
{# process each file
# Read in the strings
my $filename = shift;
print "\nProcessing: $filename\n";
@strings = ();
open inFile, "< $filename";
while (<inFile>)
{
chomp;
my $strName = $_;
$_ = <inFile>;
chomp;
push @strings, [$strName, $_];
}
close inFile;
my $lastStr = @strings - 1;
my %bestMatches = ('len' => 0); # Best match details
my $longest = 0; # Best match length so far (unexpanded)
my $startTime = [Time::HiRes::gettimeofday ()];
# Do the search
for my $curStr (0..$lastStr)
{# each string
my ($sourceName, $source) = @{$strings[$curStr]};
my @subStrs = generatePatterns ($source);
my $lastSub = @subStrs-1;
for my $targetStr (($curStr+1)..$lastStr)
{# each remaining string
my ($targetName, $target) = @{$strings[$targetStr]};
my $targetLen = length $target;
my $localLongest = 0;
my @localBests = [(0, 0, 0, 0, 0)];
for my $i (0..$lastSub)
{
my $offset = 0;
while ($offset < $targetLen)
{
$offset = index $target, $subStrs[$i][0], $offset;
last if $offset < 0;
my $matchStr2 = substr $target, $offset;
my $slipage = 0;
my $bestSlip = 0;
my $matchLen = 0;
my $first = 1;
while ($first || $slipage < $subStrSize && $subStrs[
+$i][1] < $subStrSize)
{
my $matchStr1 = substr $source, $i * $subStrSize -
+ $slipage;
($matchStr1 ^ $matchStr2) =~ /^\0*/;
if ($matchLen < $+[0])
{
$bestSlip = $slipage;
$matchLen = $+[0];
}
$slipage += $subStrs[$i][1];
$first = 0;
}
next if $matchLen < $localLongest - $subStrSize + 1;
$localLongest = $matchLen;
my @test = ($curStr, $targetStr, $i * $subStrSize -
+$bestSlip, $offset, $matchLen);
@test = expandMatch (@test);
my $dm = $test[4] - $localBests[-1][4];
@localBests = () if $dm > 0;
push @localBests, [@test] if $dm >= 0;
$offset = $test[3] + $test[4];
next if $test[4] < $longest;
$longest = $test[4];
$dm = $longest - $bestMatches{'len'};
next if $dm < 0;
%bestMatches = ('len' => $test[4]) if $dm > 0;
$bestMatches{"$test[0],$test[1],$test[2],$test[3]"}
+= $test[4];
$bestMatches{'len'} = $test[4];
}
continue {++$offset;}
}
next if ! $allLCS;
if (! @localBests)
{
print "Didn't find LCS for $sourceName and $targetName\n
+";
next;
}
for (@localBests)
{
my @curr = @$_;
printf "%03d:%03d L[%4d] (%4d %4d)\n",
$curr[0], $curr[1], $curr[4], $curr[2], $curr[3];
}
}
}
print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n"
+;
my $len = $bestMatches{'len'};
for (keys %bestMatches)
{
next if $_ eq 'len';
my @curr = split ',', $_;
printf "Best match: %s - %s. %d characters starting at %d and %d.\
+n",
$strings[$curr[0]][0], $strings[$curr[1]][0], $len, $curr[2], $cur
+r[3];
}
}
sub expandMatch
{
my ($index1, $index2, $str1Start, $str2Start, $matchLen) = @_;
my $maxMatch = max (0, min ($str1Start, $subStrSize + 10, $str2Start))
+;
my $matchStr1 = substr ($strings[$index1][1], $str1Start - $maxMatch,
+$maxMatch);
my $matchStr2 = substr ($strings[$index2][1], $str2Start - $maxMatch,
+$maxMatch);
($matchStr1 ^ $matchStr2) =~ /\0*$/;
my $adj = $+[0] - $-[0];
$matchLen += $adj;
$str1Start -= $adj;
$str2Start -= $adj;
return ($index1, $index2, $str1Start, $str2Start, $matchLen);
}
sub generatePatterns
{
my @subStrs;
my $source = shift;
my %strs;
for (my $i = 0; $i < (length $source) - $subStrSize + 1; $i += $subStr
+Size)
{
my $substr = substr $source, $i, $subStrSize;
my ($cycleLen, $str) = findCycle ($substr);
push @subStrs, [$substr, $cycleLen];
}
#push @subStrs, [$_, $strs{$_}] for keys %strs;
return @subStrs;
}
sub findCycle
{
my $str = shift;
my $copy = $str;
my $cycleLen = 0;
my $strLen = length ($copy);
for (0..($strLen - 1))
{
$copy .= substr $copy, 0, 1, '';
$cycleLen = $_ + 1;
($str ^ $copy) =~ /^\0*/;
return wantarray ? ($cycleLen, substr $str, 0, $cycleLen)
: $cycleLen if $+[0] == $strLen;
}
return wantarray ? ($strLen, $str) : $strLen;
}
sub findCycle_1
{
my $str = shift;
my $strLen = length $str;
for ( @{ divisors( $strLen ) } )
{
my $copy = $str;
$copy .= substr( $copy, 0, $_, '' );
return wantarray ? ($_, substr $str, 0, $_)
: $_ if $str eq $copy;
}
}