c:\test>608174-b -N=500 -LENGTH=3 (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN BH2 L~R Buk TZ 9.06e-002/s -- -98% -98% -99% -99% -99% -100% Eric 4.15/s 4481% -- -19% -41% -63% -75% -85% BH1 5.12/s 5548% 23% -- -27% -55% -70% -82% MN 7.02/s 7642% 69% 37% -- -38% -58% -75% BH2 11.3/s 12354% 172% 121% 61% -- -33% -59% L~R 16.9/s 18532% 307% 230% 141% 50% -- -39% Buk 27.7/s 30457% 567% 441% 295% 145% 64% -- ---- BH2 => 14 L~R => 14 Eric => 31 Buk => 14 BH1 => 14 TZ => 393262 MN => 14 c:\test>608174-b -N=500 -LENGTH=4 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.01e-002/s -- -97% -98% -98% -99% -99% -100% Eric 2.87/s 3079% -- -40% -52% -83% -84% -88% BH1 4.78/s 5198% 67% -- -20% -71% -73% -80% MN 6.00/s 6556% 109% 26% -- -64% -65% -75% L~R 16.5/s 18153% 474% 245% 174% -- -5% -33% BH2 17.4/s 19167% 506% 264% 189% 6% -- -29% Buk 24.5/s 27060% 754% 413% 308% 49% 41% -- ---- BH2 => 5 L~R => 5 Eric => 34 Buk => 5 BH1 => 5 TZ => 396184 MN => 5 c:\test>608174-b -N=500 -LENGTH=5 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.10e-002/s -- -96% -98% -98% -99% -100% -100% Eric 2.23/s 2352% -- -54% -63% -86% -90% -91% BH1 4.85/s 5227% 117% -- -20% -70% -77% -80% MN 6.06/s 6551% 171% 25% -- -63% -72% -75% L~R 16.4/s 17953% 636% 239% 171% -- -23% -33% BH2 21.3/s 23338% 856% 340% 252% 30% -- -13% Buk 24.5/s 26791% 997% 405% 304% 49% 15% -- ---- BH2 => 1 L~R => 1 Eric => 35 Buk => 1 BH1 => 1 TZ => 387220 MN => 1 c:\test>608174-b -N=500 -LENGTH=6 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.04e-002/s -- -95% -98% -99% -99% -100% -100% Eric 1.73/s 1814% -- -65% -72% -90% -92% -93% BH1 4.92/s 5344% 184% -- -20% -70% -79% -81% MN 6.13/s 6687% 255% 25% -- -63% -73% -76% L~R 16.7/s 18321% 862% 238% 171% -- -27% -34% BH2 22.9/s 25257% 1225% 366% 274% 38% -- -9% Buk 25.2/s 27829% 1359% 413% 312% 52% 10% -- ---- BH2 => 0 L~R => 0 Eric => 35 Buk => 0 BH1 => 0 TZ => 391688 MN => 0 c:\test>608174-b -N=500 -LENGTH=7 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.08e-002/s -- -94% -98% -99% -99% -100% -100% Eric 1.47/s 1521% -- -70% -76% -91% -94% -94% BH1 4.93/s 5327% 235% -- -21% -71% -80% -81% MN 6.22/s 6748% 322% 26% -- -63% -74% -76% L~R 16.9/s 18554% 1051% 244% 172% -- -30% -33% BH2 24.1/s 26469% 1539% 390% 288% 42% -- -5% Buk 25.4/s 27899% 1627% 416% 309% 50% 5% -- ---- BH2 => 0 L~R => 0 Eric => 32 Buk => 0 BH1 => 0 TZ => 393468 MN => 0 ##

##

#! perl -slw
use strict;
use Benchmark qw[ cmpthese ];

sub nCommonSubstrLenL {
    my( $haystack, $needle, $len ) = @_;
    ( $haystack, $needle ) = ( $needle, $haystack ) if length( $haystack ) < length( $needle );
    my $count = 0;
    my %possibles;
    for my $ni ( 0 .. length( $needle ) - $len ) {
        next if ++$possibles{ substr( $needle, $ni, $len ) } > 1;
        ++$count if 1+index $haystack, substr( $needle, $ni, $len );
    }
    return $count;
}

{
    my %seen;
    sub LR_common_substr {
        my ($str1, $str2, $len_subs) = @_;
        my ( $len_str1, $len_str2 ) = map length, $str1, $str2;
        my $temp1 = exists $seen{$len_str1}{$len_subs}
                  ? $seen{$len_str1}{$len_subs}
                  : ($seen{$len_str1}{$len_subs} = ("a${len_subs}X" . ($len_subs - 1)) x ($len_str1 - $len_subs + 1));
        my %substr;
        @substr{ unpack($temp1, $str1) } = ();
        my $temp2 = exists $seen{$len_str2}{$len_subs}
                  ? $seen{$len_str2}{$len_subs}
                  : ($seen{$len_str2}{$len_subs} = ("a${len_subs}X" . ($len_subs - 1)) x ($len_str2 - $len_subs + 1));
        my $count = keys %substr;
        delete @substr{ unpack($temp1, $str2) };
        return $count - keys %substr;
    }
}

{
    my @matches;
    my $push = qr/(?{ push @matches, $1 })/;

    sub match_all_ways1 {
      my ($string, $regex) = @_;
      @matches = ();
      $string =~ m/($regex)$push(?!)/;
      return @matches;
    }
}
sub BH_common_substr1 {
    my ($str1, $str2, $len) = @_;

    my %substr = map { $_ => 1 } match_all_ways1($str1 => qr/.{$len}/);
    $substr{$_} |= 2 for match_all_ways1($str2 => qr/.{$len}/);

    return grep { $substr{$_} == 3 } keys %substr;
}

{
    my @matches;
    my $push = qr/(?{ push @matches, $1 })/;

    sub match_all_ways2 {
      my ($string, $regex) = @_;
      @matches = ();
      $string =~ m/$regex$push(?!)/;
      return @matches;
    }
}
sub BH_common_substr2 {
    my ($str1, $str2, $len) = @_;
    my %subs;
    @subs{  match_all_ways2("$str1\0$str2" => qr/(.{$len}).*\0.*\1/)  } = ();
    return keys %subs;
}

{
    my $hits = 0;
    sub TZ_common_substr1 {
      my($s1, $s2) = @_;

    #  print qq(s1 = $s1, s2 = $s2\n);

      ($s1, $s2) = ($s2, $s1) if length($s2) < length($s1);
      if ($s1 eq $s2) {
        $hits++;
        return if length($s1) == 1;
      }
      my %hash = map { $_ => 1 } split(//, $s1);
      my $arr = [];
      for my $s (split(//, $s2)) {
        push(@$arr, $s) if ! exists($hash{$s});
      }
      my $splitters = join('|', @$arr);
      for my $s (split(/$splitters/, $s2)) {
        TZ_common_substr1($s, $s1);
      }
    }
    sub TZ_common_substr{ &TZ_common_substr1; return $hits }
}

sub Eric_common_sub {
    my ($s1,$s2,$len) = @_;

    my $len_s1 = length($s1);
    my $len_s2 = length($s2);

    my $match_s1 = {};
    my $match_s2 = {};

    for my $start (0..length($s1)-1) {
        for my $l (1..$len) {
            next if $start+$l > $len_s1;
            $match_s1->{substr($s1, $start, $l)} ||= 1;
        }
    }

    for my $start (0..length($s2)-1) {
        for my $l (1..$len) {
            next if $start+$l > $len_s2;
            $match_s2->{substr($s2, $start, $l)} ||= 1;
        }
    }
    $match_s1->{$_}++ for keys %$match_s2;
    return grep { $match_s1->{$_} == 2 } keys %$match_s1;
}

sub MN_CountSubstrings { # $string1, $string2, $substr_length
    my $l = pop @_;
    my %found = ();
    my %match = ();
    my $first = 2;
    my $string1 = $_[0];
    for my $string ( @_ ) {
        $first --;
        my $ls = length( $string );
        my $limit = $ls - $l + 1;
        for ( my $i = 0; $i < $limit; $i++ ) {
            my $sbstr = substr( $string, $i, $l );
            $first or defined ( $match{$sbstr} ) && next();
            $found{ $sbstr }{ $string } ||= 1;
            $first and next;;
            defined ( $found{ $sbstr }{ $string1 } )
              and $match{ $sbstr } = 1;
        }
    }
    return scalar keys %match;
}

sub rndStr{ join'', @_[ map{ rand @_ } 1 .. shift ] }

our $N ||= 50;
our $LENGTH ||= 3;
our @strings = map{ rndStr int( 30 +rand 20 ), 'A' .. 'D' } 1 .. $N;
our %results;

my %tests = (
    'L~R'   => q[ $results{ 'L~R' } = LR_common_substr(   @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
    Buk     => q[ $results{ Buk   } = nCommonSubstrLenL(  @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
    BH1     => q[ $results{ BH1   } = BH_common_substr1(  @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
    BH2     => q[ $results{ BH2   } = BH_common_substr2(  @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
    TZ      => q[ $results{ TZ    } = TZ_common_substr(   @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
    Eric    => q[ $results{ Eric  } = Eric_common_sub(    @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
    MN      => q[ $results{ MN    } = MN_CountSubstrings( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
);

cmpthese -1, \%tests;

print "\n----\n";
print "$_ => $results{ $_ }" for keys %results;