Re^3: A better implementation of LCSS? (testing ... 1,2,3)

in reply to Re^2: A better implementation of LCSS? (Yes)
in thread A better implementation of LCSS?

What we need are tests... lots of tests. I created a test script (lcss.pl) and a test input file (test.txt) which can be used to check the functionality of the various CPAN modules and other code I've stumbled upon. The focus of the testing is correct, basic functionality (not performance).

The test data is a collection of all the CPAN test files for the 3 modules (Algorithm::LCSS, String::LCSS, String::LCSS_XS). I added the checks which caused problems for the BrowserUk code, plus a few extras so far.

The test script can be used to run these checks on the various LCSS subs, one at a time. I included the 3 variants of BrowserUk's code, Perl code I found on wikipedia, plus this compact regex version: Longest common substring. The wiki, the regex and String::LCSS_XS pass all checks.

What I'd really like is a way to generate tests automatically. Generating input strings is straightforward, but generating expect values is tricky without a reference model. I tried to stfw for ready-made groups of input strings and expect values, but found nothing. I may just go ahead and use one of the 3 that have no known bugs yet as a reference model.

Here is the lcss.pl script:

#!/usr/bin/env perl

=head1 NAME

B<lcss.pl> - Run Longest Common SubString tests

=head1 SYNOPSIS

lcss.pl file ...

=head1 DESCRIPTION

Input is a file, files or STDIN.  Output is to STDOUT.

Example: lcss.pl test.txt

=cut

use warnings FATAL => 'all';
use strict;
use String::LCSS_XS qw();
use String::LCSS    qw();

sub lcssN1 (\$\$;$);
sub lcssN2 (\$\$;$);
sub lcssN3 (\$\$;$);

my $cnt = 0;
my %data;
while (<>) {
    chomp;
    if (s/^([stmec])=//) {
        my $param = $1;
        if ($param eq 'e') {
            push @{ $data{$param} }, $_;
        }
        elsif ($param eq 'c') {
            for my $p (qw(s t e)) {
                die "ERROR in input file: missing $p" unless exists $d
+ata{$p};
            }
#            my $actual = String::LCSS_XS::lcss($data{s}, $data{t});
#            my $actual = String::LCSS::lcss($data{s}, $data{t});
#            my $actual = longest_common_substr($data{s}, $data{t});
            my $actual = lc_substr($data{s}, $data{t});
#            my $actual = lcssN1($data{s}, $data{t});
#            my $actual = lcssN2($data{s}, $data{t});
#            my $actual = lcssN3($data{s}, $data{t});
            $actual = 'undef' unless defined $actual;
            check($actual);
            %data = ();
            $cnt++;
        }
        else {
            $data{$param} = $_;
        }
    }
}
print "cnt=$cnt\n";
die "ERROR: No checks performed\n" unless $cnt;
exit;


sub check {
    my $actual = shift;
    my $pass = 0;
    my @expects = @{ $data{e} };
    my $num = @expects;
    for my $expect (@expects) {
        $pass++ if $actual eq $expect;
    }
    unless ($pass) {
        my $msg = exists $data{m} ? $data{m} : '';
        warn "FAILED:";
        print "actual='$actual'\n";
        for my $expect (@expects) {
            print "expect='$expect'\n";
        }
        print "s='$data{s}'\n";
        print "t='$data{t}'\n";
        print "m=$msg\n\n";
    }
}

sub longest_common_substr {
    #japhy
  # provided you know there are no NULs
  my $str = join "\0", @_;
  my $len = 1;
  my $match;

  while ($str =~ m{ ([^\0]{$len,}) (?= [^\0]* \0 [^\0]*? \1 ) }x) {
    $len = length($match = $1) + 1;
  }

  return $match;
}

sub lcssN3 (\$\$;$) {
    # BrowserUk, rev. 3
    my( $ref1, $ref2, $min ) = @_;
    my( $swapped, $l1, $l2 ) = ( 0, map length( $$_ ), $ref1, $ref2 );
    ( $l2, $ref2, $l1, $ref1, $swapped ) = ( $l1, $ref1, $l2, $ref2, 1
+ ) if $l1 > $l2;
    $min = 1 unless defined $min;

    my $mask = $$ref1 x ( int( $l2 / $l1 ) );

    my @match = '';
    for my $start ( 0 .. $l1-1 ) {
        my $masked = substr( $mask, $start, $l2 ) ^ $$ref2;
        while( $masked =~ m[\0{$min,}]go ) {
            my $l = $+[ 0 ] - $-[ 0 ];
            my $match = substr( $$ref2, $-[ 0 ], $l );
            @match = (
                $match,
                ( $-[ 0 ]+$start ) % $l1,
                $-[ 0 ]
            ) if $l > length $match[ 0 ];
        }
    }
    @match[ 2, 1 ] = @match[ 1, 2 ] if $swapped;
    return unless $match[ 0 ];
    return wantarray ? @match : $match[ 0 ];
}
sub lcssN2 (\$\$;$) {
    # BrowserUk, rev. 2
    my( $ref1, $ref2, $min ) = @_;
    my( $swapped, $l1, $l2 ) = ( 0, map length( $$_ ), $ref1, $ref2 );
    ( $l2, $ref2, $l1, $ref1, $swapped ) = ( $l1, $ref1, $l2, $ref2, 1
+ ) if $l1 > $l2;
    $min = 1 unless defined $min;

    my $mask = $$ref1 x ( int( $l2 / $l1 ) + 1 );

    my @match = '';
    for my $start ( 0 .. $l1-1 ) {
        my $masked = substr( $mask, $start, $l2 ) ^ $$ref2;
        while( $masked =~ m[\0{$min,}]go ) {
            my $l = $+[ 0 ] - $-[ 0 ];
            my $match = substr( $$ref2, $-[ 0 ], $l );
            next unless 1+index $$ref1, $match;
            @match = (
                $match,
                ( $-[ 0 ]+$start ) % $l1,
                $-[ 0 ]
            ) if $l > length $match[ 0 ];
        }
    }
    @match[ 2, 1 ] = @match[ 1, 2 ] if $swapped;
    return unless $match[ 0 ];
    return wantarray ? @match : $match[ 0 ];
}
sub lcssN1 (\$\$;$) {
    # BrowserUk, rev. 1
    my( $ref1, $ref2, $min ) = @_;
    my( $swapped, $l1, $l2 ) = ( 0, map length( $$_ ), $ref1, $ref2 );
    ( $l2, $ref2, $l1, $ref1, $swapped ) = ( $l1, $ref1, $l2, $ref2, 1
+ ) if $l1 > $l2;
    $min = 1 unless defined $min;

    my $mask = $$ref1 x ( int( $l2 / $l1 ) + 1 );

    my @match = '';
    for my $start ( 0 .. $l1-1 ) {
        my $masked = substr( $mask, $start, $l2 ) ^ $$ref2;
        while( $masked =~ m[\0{$min,}]go ) {
            @match = (
                substr( $$ref2, $-[ 0 ], $+[ 0 ] - $-[ 0 ] ),
                ( $-[ 0 ]+$start ) % $l1,
                $-[ 0 ]
            ) if ( $+[ 0 ] - $-[ 0 ] ) > length $match[ 0 ];
        }
    }
    @match[ 2, 1 ] = @match[ 1, 2 ] if $swapped;
    return unless $match[ 0 ];
    return wantarray ? @match : $match[ 0 ];
}

sub lc_substr {
    # wiki
  my ($str1, $str2) = @_; 
  my $l_length = 0; # length of longest common substring
  my $len1 = length $str1; 
  my $len2 = length $str2; 
  my @char1 = (undef, split(//, $str1)); # $str1 as array of chars, in
+dexed from 1
  my @char2 = (undef, split(//, $str2)); # $str2 as array of chars, in
+dexed from 1
  my @lc_suffix; # "longest common suffix" table
  my @substrings; # list of common substrings of length $l_length
 
  for my $n1 ( 1 .. $len1 ) { 
    for my $n2 ( 1 .. $len2 ) { 
      if ($char1[$n1] eq $char2[$n2]) {
        # We have found a matching character. Is this the first matchi
+ng character, or a
    # continuation of previous matching characters? If the former, the
+n the length of
    # the previous matching portion is undefined; set to zero.
        $lc_suffix[$n1-1][$n2-1] ||= 0;
    # In either case, declare the match to be one character longer tha
+n the match of
    # characters preceding this character.
        $lc_suffix[$n1][$n2] = $lc_suffix[$n1-1][$n2-1] + 1;
    # If the resulting substring is longer than our previously recorde
+d max length ...
        if ($lc_suffix[$n1][$n2] > $l_length) {
      # ... we record its length as our new max length ...
          $l_length = $lc_suffix[$n1][$n2];
      # ... and clear our result list of shorter substrings.
          @substrings = ();
        }
    # If this substring is equal to our longest ...
        if ($lc_suffix[$n1][$n2] == $l_length) {
      # ... add it to our list of solutions.
          push @substrings, substr($str1, ($n1-$l_length), $l_length);
        }
      }
    }
  }   
 
  return $substrings[0];
#  return @substrings;
}

__END__

Input file sytnax:
- lines starting with s= define the 1st input string
- lines starting with t= define the 2nd input string
- lines starting with m= define the test message (optional)
- lines starting with e= define the expected output
- lines starting with c= trigger the check
- all other lines are ignored and can be used for comments
- whitespace is significant, except for the trailing newline
- limited to "printable" ascii: space to tilde
[download]

Here is the test data file test.txt:

# Basic tests
# All input string pairs have at least one common substring

s=abcde
t=bcd77
e=bcd
c=

m= rt52839 Algorithm::LCSS bug
s=CAGAGTTCTACAGTCCGACGATCACTAA
t=ACCGACGATCACTATCGTACGACTCTTAGCAAGCAGA
e=CCGACGATCACTA
c=

m= rt32036 String::LCSS bug
s=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 2 18 19 20 21 22 23 7 24
t=1 2 3 4 5 7 8 9 11 12 13 10 14 15 16 17 2 18 19 20 21 22 23 7 24
e= 14 15 16 17 2 18 19 20 21 22 23 7 24
c=

m= rt62175 String::LCSS bug
s=the quick brown fox jumped over the lazy dog
t=I saw a quick brown fox and jumped over the lazy dog
e= jumped over the lazy dog
c=

m= BrowserUk lcssN bug
s=xxxyyxxy
t=yyyxyxx
e=yyx
c=

m= BrowserUk lcssN bug
s=abcdefg
t=abcdefga
e=abcdefg
c=

m= Algorithm::LCSS cpan test.pl
s=abcdefghijklmnopqrstuvwxyz
t=flubberabcdubberdofghijklm
e=fghijklm
c=

# String::LCSS cpan test (t/01strings.t)

s=xyzzx
t=abcxyzefg
e=xyz
c=

s=abcxyzzx
t=abcxyzefg
e=abcxyz
c=

s=i pushed the lazy dog into a creek, the quick brown fox told me to
t=the quick brown fox jumps over the lazy dog
e=the quick brown fox 
c=

m= reverse of above s and t
s=the quick brown fox jumps over the lazy dog
t=i pushed the lazy dog into a creek, the quick brown fox told me to
e=the quick brown fox 
c=

s=i pushed the lazy dog into a creek, the quick brown fox told me to
t=why did the quick brown fox jumps over the lazy dog
e= the quick brown fox 
c=


# String::LCSS_XS cpan test (t/10.lcss.t)

s=xyzzx
t=abcxyzefg
e=xyz
c=

s=abcxyzzx
t=abcxyzefg
e=abcxyz
c=

# LCSS_XS=a, japhy=f, browseruk=b
m= 3 possible lcss depending on order: a b f
s=foobar
t=abcxyzefg
e=a
e=b
e=f
c=

s=ABBAGGG
t=HHHHZZAB
e=AB
c=

m= reverse s and t from above
s=HHHHZZAB
t=ABBAGGG
e=AB
c=

s=zyzxx
t=abczyzefg
e=zyz
c=

m= "bug"/"feature" in String::LCSS
s=b
t=ab
e=b
c=

# end String::LCSS_XS cpan test (t/10.lcss.t)

s=____  ___
t=dhfgdja___ 000ghakj
e=___ 
c=

s=111000111
t=10130000555
e=000
c=

s=+=-_)(*&^%$#@!~`|\]}{["';:?/.><, zxcvbnm
t=     xcv-----
e=xcv
c=

m= s=3space,t=7space,e=3space
s=   
t=       
e=   
c=

s=111
t=111
e=111
c=

s=abc
t=cba000
e=a
e=b
e=c
c=
[download]

In Section Seekers of Perl Wisdom