http://qs321.pair.com?node_id=11122340


in reply to Re^2: substrings that consist of repeating characters
in thread substrings that consist of repeating characters

my $string = "AAATTTAGTTCTTAAGGCTGACATCGGTTTACGTCAGCGTTACCCCCCAAGTTATT +GGGGACTTT"; my %expect = qw( CCCCCC 1 GGGG 1 AAA 1 TTT 3 AA 2 GG 2 TT 5 ); my $n = shift // 1; if ($n > 1) { $string = $string x $n; $_ *= $n for values %expect; } use Test::More; use Benchmark qw(cmpthese); my %subs; my @v = map { "v$_" } 1 .. 8; my %f; @f{@v} = ( sub { %subs = (); $subs{$_}++ for grep { length >= 2 } split m/,/ => ($string =~ s/( +[ACGT])\K(?!\1)/,/gr); }, # v1 sub { %subs = (); $subs{$_}++ for grep m/^([ACGT])\1+$/ => split m/,/ => ($string =~ + s/(\w)\K(?!\1)/,/gr); }, # v2 sub { %subs = (); $subs{$_}++ for $string =~ m/(AA+|CC+|GG+|TT+)/g; }, # v3 sub { %subs = (); $subs{$1}++ while $string =~ m{(([ACGT])\2+)}g; }, # v4 sub { %subs = (); $subs{$&}++ while $string =~ m{([ACGT])\1+}g; }, # v5 sub { %subs = (); $subs{$&}++ while $string =~ m{A{2,}|C{2,}|G{2,}|T{2,}}g; }, # v6 sub { %subs = (); $subs{$&}++ while $string =~ m{AA+|CC+|GG+|TT+}g; }, # v7 sub { %subs = (); $subs{$&}++ while $string =~ m{()AA+|CC+|GG+|TT+}g; }, # v8 ); for (@v) { $f{$_}->(); is_deeply (\%subs, \%expect, $_); } printf "%5d %3d %s\n", $subs{$_->[1]}, @$_ for sort { $b->[0] <=> $a-> +[0] || $a->[1] cmp $b->[1] } map {[ length, $_ ]} keys %subs; cmpthese (-2, { map {( $_ => $f{$_} )} @v }); done_testing;
$ test.pl 1 ok 1 - v1 ok 2 - v2 ok 3 - v3 ok 4 - v4 ok 5 - v5 ok 6 - v6 ok 7 - v7 ok 8 - v8 1 6 CCCCCC 1 4 GGGG 1 3 AAA 3 3 TTT 2 2 AA 2 2 GG 5 2 TT Rate v2 v1 v7 v3 v4 v5 v6 v8 v2 41819/s -- -30% -45% -53% -57% -58% -60% -63% v1 60150/s 44% -- -21% -32% -38% -40% -43% -47% v7 76560/s 83% 27% -- -13% -22% -23% -28% -32% v3 88071/s 111% 46% 15% -- -10% -12% -17% -22% v4 97745/s 134% 63% 28% 11% -- -2% -8% -13% v5 99555/s 138% 66% 30% 13% 2% -- -6% -12% v6 105700/s 153% 76% 38% 20% 8% 6% -- -6% v8 112783/s 170% 88% 47% 28% 15% 13% 7% -- 1..8
$ test.pl 20 ok 1 - v1 ok 2 - v2 ok 3 - v3 ok 4 - v4 ok 5 - v5 ok 6 - v6 ok 7 - v7 ok 8 - v8 20 6 CCCCCC 20 4 GGGG 20 3 AAA 60 3 TTT 40 2 AA 40 2 GG 100 2 TT Rate v2 v1 v7 v3 v4 v5 v6 v8 v2 2327/s -- -29% -47% -52% -55% -57% -61% -65% v1 3284/s 41% -- -26% -32% -37% -39% -45% -50% v7 4419/s 90% 35% -- -9% -15% -17% -26% -33% v3 4853/s 109% 48% 10% -- -7% -9% -18% -27% v4 5215/s 124% 59% 18% 7% -- -3% -12% -21% v5 5351/s 130% 63% 21% 10% 3% -- -10% -19% v6 5934/s 155% 81% 34% 22% 14% 11% -- -10% v8 6604/s 184% 101% 49% 36% 27% 23% 11% -- 1..8
$ test.pl 2000 ok 1 - v1 ok 2 - v2 ok 3 - v3 ok 4 - v4 ok 5 - v5 ok 6 - v6 ok 7 - v7 ok 8 - v8 2000 6 CCCCCC 2000 4 GGGG 2000 3 AAA 6000 3 TTT 4000 2 AA 4000 2 GG 10000 2 TT Rate v2 v1 v7 v3 v4 v5 v6 v8 v2 21.3/s -- -35% -50% -54% -60% -61% -64% -68% v1 32.7/s 54% -- -23% -30% -38% -39% -45% -51% v7 42.6/s 100% 30% -- -9% -19% -21% -28% -36% v3 46.6/s 119% 42% 9% -- -12% -14% -21% -30% v4 52.7/s 147% 61% 24% 13% -- -2% -11% -21% v5 54.0/s 154% 65% 27% 16% 3% -- -9% -19% v6 59.2/s 178% 81% 39% 27% 13% 10% -- -11% v8 66.3/s 212% 103% 56% 42% 26% 23% 12% -- 1..8

Enjoy, Have FUN! H.Merijn