c:\test>608174-b -N=500 -LENGTH=3 (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN BH2 L~R Buk TZ 9.06e-002/s -- -98% -98% -99% -99% -99% -100% Eric 4.15/s 4481% -- -19% -41% -63% -75% -85% BH1 5.12/s 5548% 23% -- -27% -55% -70% -82% MN 7.02/s 7642% 69% 37% -- -38% -58% -75% BH2 11.3/s 12354% 172% 121% 61% -- -33% -59% L~R 16.9/s 18532% 307% 230% 141% 50% -- -39% Buk 27.7/s 30457% 567% 441% 295% 145% 64% -- ---- BH2 => 14 L~R => 14 Eric => 31 Buk => 14 BH1 => 14 TZ => 393262 MN => 14 c:\test>608174-b -N=500 -LENGTH=4 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.01e-002/s -- -97% -98% -98% -99% -99% -100% Eric 2.87/s 3079% -- -40% -52% -83% -84% -88% BH1 4.78/s 5198% 67% -- -20% -71% -73% -80% MN 6.00/s 6556% 109% 26% -- -64% -65% -75% L~R 16.5/s 18153% 474% 245% 174% -- -5% -33% BH2 17.4/s 19167% 506% 264% 189% 6% -- -29% Buk 24.5/s 27060% 754% 413% 308% 49% 41% -- ---- BH2 => 5 L~R => 5 Eric => 34 Buk => 5 BH1 => 5 TZ => 396184 MN => 5 c:\test>608174-b -N=500 -LENGTH=5 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.10e-002/s -- -96% -98% -98% -99% -100% -100% Eric 2.23/s 2352% -- -54% -63% -86% -90% -91% BH1 4.85/s 5227% 117% -- -20% -70% -77% -80% MN 6.06/s 6551% 171% 25% -- -63% -72% -75% L~R 16.4/s 17953% 636% 239% 171% -- -23% -33% BH2 21.3/s 23338% 856% 340% 252% 30% -- -13% Buk 24.5/s 26791% 997% 405% 304% 49% 15% -- ---- BH2 => 1 L~R => 1 Eric => 35 Buk => 1 BH1 => 1 TZ => 387220 MN => 1 c:\test>608174-b -N=500 -LENGTH=6 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.04e-002/s -- -95% -98% -99% -99% -100% -100% Eric 1.73/s 1814% -- -65% -72% -90% -92% -93% BH1 4.92/s 5344% 184% -- -20% -70% -79% -81% MN 6.13/s 6687% 255% 25% -- -63% -73% -76% L~R 16.7/s 18321% 862% 238% 171% -- -27% -34% BH2 22.9/s 25257% 1225% 366% 274% 38% -- -9% Buk 25.2/s 27829% 1359% 413% 312% 52% 10% -- ---- BH2 => 0 L~R => 0 Eric => 35 Buk => 0 BH1 => 0 TZ => 391688 MN => 0 c:\test>608174-b -N=500 -LENGTH=7 (warning: too few iterations for a reliable count) (warning: too few iterations for a reliable count) Rate TZ Eric BH1 MN L~R BH2 Buk TZ 9.08e-002/s -- -94% -98% -99% -99% -100% -100% Eric 1.47/s 1521% -- -70% -76% -91% -94% -94% BH1 4.93/s 5327% 235% -- -21% -71% -80% -81% MN 6.22/s 6748% 322% 26% -- -63% -74% -76% L~R 16.9/s 18554% 1051% 244% 172% -- -30% -33% BH2 24.1/s 26469% 1539% 390% 288% 42% -- -5% Buk 25.4/s 27899% 1627% 416% 309% 50% 5% -- ---- BH2 => 0 L~R => 0 Eric => 32 Buk => 0 BH1 => 0 TZ => 393468 MN => 0 #### #! perl -slw use strict; use Benchmark qw[ cmpthese ]; sub nCommonSubstrLenL { my( $haystack, $needle, $len ) = @_; ( $haystack, $needle ) = ( $needle, $haystack ) if length( $haystack ) < length( $needle ); my $count = 0; my %possibles; for my $ni ( 0 .. length( $needle ) - $len ) { next if ++$possibles{ substr( $needle, $ni, $len ) } > 1; ++$count if 1+index $haystack, substr( $needle, $ni, $len ); } return $count; } { my %seen; sub LR_common_substr { my ($str1, $str2, $len_subs) = @_; my ( $len_str1, $len_str2 ) = map length, $str1, $str2; my $temp1 = exists $seen{$len_str1}{$len_subs} ? $seen{$len_str1}{$len_subs} : ($seen{$len_str1}{$len_subs} = ("a${len_subs}X" . ($len_subs - 1)) x ($len_str1 - $len_subs + 1)); my %substr; @substr{ unpack($temp1, $str1) } = (); my $temp2 = exists $seen{$len_str2}{$len_subs} ? $seen{$len_str2}{$len_subs} : ($seen{$len_str2}{$len_subs} = ("a${len_subs}X" . ($len_subs - 1)) x ($len_str2 - $len_subs + 1)); my $count = keys %substr; delete @substr{ unpack($temp1, $str2) }; return $count - keys %substr; } } { my @matches; my $push = qr/(?{ push @matches, $1 })/; sub match_all_ways1 { my ($string, $regex) = @_; @matches = (); $string =~ m/($regex)$push(?!)/; return @matches; } } sub BH_common_substr1 { my ($str1, $str2, $len) = @_; my %substr = map { $_ => 1 } match_all_ways1($str1 => qr/.{$len}/); $substr{$_} |= 2 for match_all_ways1($str2 => qr/.{$len}/); return grep { $substr{$_} == 3 } keys %substr; } { my @matches; my $push = qr/(?{ push @matches, $1 })/; sub match_all_ways2 { my ($string, $regex) = @_; @matches = (); $string =~ m/$regex$push(?!)/; return @matches; } } sub BH_common_substr2 { my ($str1, $str2, $len) = @_; my %subs; @subs{ match_all_ways2("$str1\0$str2" => qr/(.{$len}).*\0.*\1/) } = (); return keys %subs; } { my $hits = 0; sub TZ_common_substr1 { my($s1, $s2) = @_; # print qq(s1 = $s1, s2 = $s2\n); ($s1, $s2) = ($s2, $s1) if length($s2) < length($s1); if ($s1 eq $s2) { $hits++; return if length($s1) == 1; } my %hash = map { $_ => 1 } split(//, $s1); my $arr = []; for my $s (split(//, $s2)) { push(@$arr, $s) if ! exists($hash{$s}); } my $splitters = join('|', @$arr); for my $s (split(/$splitters/, $s2)) { TZ_common_substr1($s, $s1); } } sub TZ_common_substr{ &TZ_common_substr1; return $hits } } sub Eric_common_sub { my ($s1,$s2,$len) = @_; my $len_s1 = length($s1); my $len_s2 = length($s2); my $match_s1 = {}; my $match_s2 = {}; for my $start (0..length($s1)-1) { for my $l (1..$len) { next if $start+$l > $len_s1; $match_s1->{substr($s1, $start, $l)} ||= 1; } } for my $start (0..length($s2)-1) { for my $l (1..$len) { next if $start+$l > $len_s2; $match_s2->{substr($s2, $start, $l)} ||= 1; } } $match_s1->{$_}++ for keys %$match_s2; return grep { $match_s1->{$_} == 2 } keys %$match_s1; } sub MN_CountSubstrings { # $string1, $string2, $substr_length my $l = pop @_; my %found = (); my %match = (); my $first = 2; my $string1 = $_[0]; for my $string ( @_ ) { $first --; my $ls = length( $string ); my $limit = $ls - $l + 1; for ( my $i = 0; $i < $limit; $i++ ) { my $sbstr = substr( $string, $i, $l ); $first or defined ( $match{$sbstr} ) && next(); $found{ $sbstr }{ $string } ||= 1; $first and next;; defined ( $found{ $sbstr }{ $string1 } ) and $match{ $sbstr } = 1; } } return scalar keys %match; } sub rndStr{ join'', @_[ map{ rand @_ } 1 .. shift ] } our $N ||= 50; our $LENGTH ||= 3; our @strings = map{ rndStr int( 30 +rand 20 ), 'A' .. 'D' } 1 .. $N; our %results; my %tests = ( 'L~R' => q[ $results{ 'L~R' } = LR_common_substr( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], Buk => q[ $results{ Buk } = nCommonSubstrLenL( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], BH1 => q[ $results{ BH1 } = BH_common_substr1( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], BH2 => q[ $results{ BH2 } = BH_common_substr2( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], TZ => q[ $results{ TZ } = TZ_common_substr( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], Eric => q[ $results{ Eric } = Eric_common_sub( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], MN => q[ $results{ MN } = MN_CountSubstrings( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], ); cmpthese -1, \%tests; print "\n----\n"; print "$_ => $results{ $_ }" for keys %results;