#!/usr/bin/env perl =head1 NAME B - Run Longest Common SubString tests =head1 SYNOPSIS lcss.pl file ... =head1 DESCRIPTION Input is a file, files or STDIN. Output is to STDOUT. Example: lcss.pl test.txt =cut use warnings FATAL => 'all'; use strict; use String::LCSS_XS qw(); use String::LCSS qw(); sub lcssN1 (\$\$;$); sub lcssN2 (\$\$;$); sub lcssN3 (\$\$;$); my $cnt = 0; my %data; while (<>) { chomp; if (s/^([stmec])=//) { my $param = $1; if ($param eq 'e') { push @{ $data{$param} }, $_; } elsif ($param eq 'c') { for my $p (qw(s t e)) { die "ERROR in input file: missing $p" unless exists $data{$p}; } # my $actual = String::LCSS_XS::lcss($data{s}, $data{t}); # my $actual = String::LCSS::lcss($data{s}, $data{t}); # my $actual = longest_common_substr($data{s}, $data{t}); my $actual = lc_substr($data{s}, $data{t}); # my $actual = lcssN1($data{s}, $data{t}); # my $actual = lcssN2($data{s}, $data{t}); # my $actual = lcssN3($data{s}, $data{t}); $actual = 'undef' unless defined $actual; check($actual); %data = (); $cnt++; } else { $data{$param} = $_; } } } print "cnt=$cnt\n"; die "ERROR: No checks performed\n" unless $cnt; exit; sub check { my $actual = shift; my $pass = 0; my @expects = @{ $data{e} }; my $num = @expects; for my $expect (@expects) { $pass++ if $actual eq $expect; } unless ($pass) { my $msg = exists $data{m} ? $data{m} : ''; warn "FAILED:"; print "actual='$actual'\n"; for my $expect (@expects) { print "expect='$expect'\n"; } print "s='$data{s}'\n"; print "t='$data{t}'\n"; print "m=$msg\n\n"; } } sub longest_common_substr { #japhy # provided you know there are no NULs my $str = join "\0", @_; my $len = 1; my $match; while ($str =~ m{ ([^\0]{$len,}) (?= [^\0]* \0 [^\0]*? \1 ) }x) { $len = length($match = $1) + 1; } return $match; } sub lcssN3 (\$\$;$) { # BrowserUk, rev. 3 my( $ref1, $ref2, $min ) = @_; my( $swapped, $l1, $l2 ) = ( 0, map length( $$_ ), $ref1, $ref2 ); ( $l2, $ref2, $l1, $ref1, $swapped ) = ( $l1, $ref1, $l2, $ref2, 1 ) if $l1 > $l2; $min = 1 unless defined $min; my $mask = $$ref1 x ( int( $l2 / $l1 ) ); my @match = ''; for my $start ( 0 .. $l1-1 ) { my $masked = substr( $mask, $start, $l2 ) ^ $$ref2; while( $masked =~ m[\0{$min,}]go ) { my $l = $+[ 0 ] - $-[ 0 ]; my $match = substr( $$ref2, $-[ 0 ], $l ); @match = ( $match, ( $-[ 0 ]+$start ) % $l1, $-[ 0 ] ) if $l > length $match[ 0 ]; } } @match[ 2, 1 ] = @match[ 1, 2 ] if $swapped; return unless $match[ 0 ]; return wantarray ? @match : $match[ 0 ]; } sub lcssN2 (\$\$;$) { # BrowserUk, rev. 2 my( $ref1, $ref2, $min ) = @_; my( $swapped, $l1, $l2 ) = ( 0, map length( $$_ ), $ref1, $ref2 ); ( $l2, $ref2, $l1, $ref1, $swapped ) = ( $l1, $ref1, $l2, $ref2, 1 ) if $l1 > $l2; $min = 1 unless defined $min; my $mask = $$ref1 x ( int( $l2 / $l1 ) + 1 ); my @match = ''; for my $start ( 0 .. $l1-1 ) { my $masked = substr( $mask, $start, $l2 ) ^ $$ref2; while( $masked =~ m[\0{$min,}]go ) { my $l = $+[ 0 ] - $-[ 0 ]; my $match = substr( $$ref2, $-[ 0 ], $l ); next unless 1+index $$ref1, $match; @match = ( $match, ( $-[ 0 ]+$start ) % $l1, $-[ 0 ] ) if $l > length $match[ 0 ]; } } @match[ 2, 1 ] = @match[ 1, 2 ] if $swapped; return unless $match[ 0 ]; return wantarray ? @match : $match[ 0 ]; } sub lcssN1 (\$\$;$) { # BrowserUk, rev. 1 my( $ref1, $ref2, $min ) = @_; my( $swapped, $l1, $l2 ) = ( 0, map length( $$_ ), $ref1, $ref2 ); ( $l2, $ref2, $l1, $ref1, $swapped ) = ( $l1, $ref1, $l2, $ref2, 1 ) if $l1 > $l2; $min = 1 unless defined $min; my $mask = $$ref1 x ( int( $l2 / $l1 ) + 1 ); my @match = ''; for my $start ( 0 .. $l1-1 ) { my $masked = substr( $mask, $start, $l2 ) ^ $$ref2; while( $masked =~ m[\0{$min,}]go ) { @match = ( substr( $$ref2, $-[ 0 ], $+[ 0 ] - $-[ 0 ] ), ( $-[ 0 ]+$start ) % $l1, $-[ 0 ] ) if ( $+[ 0 ] - $-[ 0 ] ) > length $match[ 0 ]; } } @match[ 2, 1 ] = @match[ 1, 2 ] if $swapped; return unless $match[ 0 ]; return wantarray ? @match : $match[ 0 ]; } sub lc_substr { # wiki my ($str1, $str2) = @_; my $l_length = 0; # length of longest common substring my $len1 = length $str1; my $len2 = length $str2; my @char1 = (undef, split(//, $str1)); # $str1 as array of chars, indexed from 1 my @char2 = (undef, split(//, $str2)); # $str2 as array of chars, indexed from 1 my @lc_suffix; # "longest common suffix" table my @substrings; # list of common substrings of length $l_length for my $n1 ( 1 .. $len1 ) { for my $n2 ( 1 .. $len2 ) { if ($char1[$n1] eq $char2[$n2]) { # We have found a matching character. Is this the first matching character, or a # continuation of previous matching characters? If the former, then the length of # the previous matching portion is undefined; set to zero. $lc_suffix[$n1-1][$n2-1] ||= 0; # In either case, declare the match to be one character longer than the match of # characters preceding this character. $lc_suffix[$n1][$n2] = $lc_suffix[$n1-1][$n2-1] + 1; # If the resulting substring is longer than our previously recorded max length ... if ($lc_suffix[$n1][$n2] > $l_length) { # ... we record its length as our new max length ... $l_length = $lc_suffix[$n1][$n2]; # ... and clear our result list of shorter substrings. @substrings = (); } # If this substring is equal to our longest ... if ($lc_suffix[$n1][$n2] == $l_length) { # ... add it to our list of solutions. push @substrings, substr($str1, ($n1-$l_length), $l_length); } } } } return $substrings[0]; # return @substrings; } __END__ Input file sytnax: - lines starting with s= define the 1st input string - lines starting with t= define the 2nd input string - lines starting with m= define the test message (optional) - lines starting with e= define the expected output - lines starting with c= trigger the check - all other lines are ignored and can be used for comments - whitespace is significant, except for the trailing newline - limited to "printable" ascii: space to tilde #### # Basic tests # All input string pairs have at least one common substring s=abcde t=bcd77 e=bcd c= m= rt52839 Algorithm::LCSS bug s=CAGAGTTCTACAGTCCGACGATCACTAA t=ACCGACGATCACTATCGTACGACTCTTAGCAAGCAGA e=CCGACGATCACTA c= m= rt32036 String::LCSS bug s=1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 2 18 19 20 21 22 23 7 24 t=1 2 3 4 5 7 8 9 11 12 13 10 14 15 16 17 2 18 19 20 21 22 23 7 24 e= 14 15 16 17 2 18 19 20 21 22 23 7 24 c= m= rt62175 String::LCSS bug s=the quick brown fox jumped over the lazy dog t=I saw a quick brown fox and jumped over the lazy dog e= jumped over the lazy dog c= m= BrowserUk lcssN bug s=xxxyyxxy t=yyyxyxx e=yyx c= m= BrowserUk lcssN bug s=abcdefg t=abcdefga e=abcdefg c= m= Algorithm::LCSS cpan test.pl s=abcdefghijklmnopqrstuvwxyz t=flubberabcdubberdofghijklm e=fghijklm c= # String::LCSS cpan test (t/01strings.t) s=xyzzx t=abcxyzefg e=xyz c= s=abcxyzzx t=abcxyzefg e=abcxyz c= s=i pushed the lazy dog into a creek, the quick brown fox told me to t=the quick brown fox jumps over the lazy dog e=the quick brown fox c= m= reverse of above s and t s=the quick brown fox jumps over the lazy dog t=i pushed the lazy dog into a creek, the quick brown fox told me to e=the quick brown fox c= s=i pushed the lazy dog into a creek, the quick brown fox told me to t=why did the quick brown fox jumps over the lazy dog e= the quick brown fox c= # String::LCSS_XS cpan test (t/10.lcss.t) s=xyzzx t=abcxyzefg e=xyz c= s=abcxyzzx t=abcxyzefg e=abcxyz c= # LCSS_XS=a, japhy=f, browseruk=b m= 3 possible lcss depending on order: a b f s=foobar t=abcxyzefg e=a e=b e=f c= s=ABBAGGG t=HHHHZZAB e=AB c= m= reverse s and t from above s=HHHHZZAB t=ABBAGGG e=AB c= s=zyzxx t=abczyzefg e=zyz c= m= "bug"/"feature" in String::LCSS s=b t=ab e=b c= # end String::LCSS_XS cpan test (t/10.lcss.t) s=____ ___ t=dhfgdja___ 000ghakj e=___ c= s=111000111 t=10130000555 e=000 c= s=+=-_)(*&^%$#@!~`|\]}{["';:?/.><, zxcvbnm t= xcv----- e=xcv c= m= s=3space,t=7space,e=3space s= t= e= c= s=111 t=111 e=111 c= s=abc t=cba000 e=a e=b e=c c=