The CPAN module
Regexp::MatchContext was written for this task. The module's SYNOPSIS:
use Regexp::MatchContext -vars;
$str =~ m/(?p) \d+ /;
print "Before: $PREMATCH\n";
print "Matched: $MATCH\n";
print "After: $POSTMATCH\n";
$MATCH = 2 * $MATCH; # substitute into original $str
Note that this and the previous solutions are significantly slower than using the match variables
$`, $& and
$'. However, as
tye mentioned, these variables will slow down EVERY other regular expression without capturing parentheses.
The following benchmark (searching a short (11 characters or base pairs) DNA sequence in a 2000 bp DNA sequence) shows the results of a comparison of all four solutions:
Rate regex context at_minus matchvars
regex 17271/s -- -22% -66% -84%
context 22239/s 29% -- -56% -79%
at_minus 50420/s 192% 127% -- -53%
matchvars 107527/s 523% 384% 113% --
Note that this benchmark uses match variables and thus slows down all four solutions. The results without the match variable solution are:
Rate regex context at_minus
regex 17544/s -- -24% -69%
context 23112/s 32% -- -60%
at_minus 57361/s 227% 148% --
Appendix: Source code of the benchmark
#!/usr/bin/perl
use strict;
use warnings;
use Benchmark qw(:all);
use Regexp::MatchContext;
# Set to a true value to check by eye that all solutions produce the same
# output (each candidate then warns its three pieces).
my $VERBOSE = 0;

# Number of iterations per candidate; two are enough when only the output
# is being compared.
my $count = $VERBOSE ? 2 : 300000;
# DNA sequence (roughly 2000 bp) that is searched by every benchmark
# candidate. It is written as a qw() list and joined so the source stays
# readable while the resulting string contains nothing but the bases
# A, C, G and T -- no embedded newlines or line-wrap markers.
my $seq = join q{}, qw(
    GGGTTGAAGTTTAGACCGCTCACAGTAGTTCTACCTATAGAAAAGATCATGAAAGAGGCGATC
    AGAATGGTACTCGAATCCATTTACGATCCCGAGTTTCCAGACACATCGCATTTCCGCTCGGGTCAAGGC
    TGCCACTCGGTCCTAAGACGGATCAAAGAAGAGTGGGGAATCTCTCGCTGGTTTTTAGAATTCGACATC
    AGGAAGTGTTTTCACACCATCGACCGACATCGACTCATCCAAATTTTGAAGGAAGAGATCGACGATCCC
    AAGTTCTTTTACTCCATTCAGAAAGTATTTTCCGCCGGACGACTCGTAGGAGTTGAGAGGGGCCCTTAC
    TCCGTCCCACACAGTGTACTACTATCGGCCCTACCAGGCAACATCTACCTACACAAGCTCGATCAGGAG
    ATAGGGAGGATCCGACAGAAGTACGAAATTCCGATTGTTCAGAGAGTCAGATCGGTTCTATTAAGGACA
    GGTCGTCGTATTGATGACCAAGAAAACCCTGGAGAAGAAGCAAGCTTCAACGCTCCCCAAGACAACAGA
    GCCATCATTGTGGGGAGCGTTAAGAGCATGCAACGCAAAGCGGCCTTTCATTCCCTTGTTTCGTCGTGG
    CACACCCCCCCCACAAGCACCCTCCGGCTCAGGGGGGACCAGAAAAGGCCTTTCGTTTTCCCCCCTTCG
    TCGGCCCTTGCCGTCTTCCTTAACAAGCCCTCGAGCCTTCTTTGCGCCGCCTTCCTCATAGAAGCCGCC
    GGGTTGACCCCGAAGGCTGAATTCTATGGTGGAGAACGCTGTAATAATAATTGGGCCATGAGAGACCTT
    CTTAAGTATTGCAAAAGAAAGGGCCTGCTGATAGAGCTGGGCGGGGAGGCGATACTAGTTATCAGGTCA
    GAGAGAGGCCTGGCCCGTAAGCAGGCCCCCTTAAAAACCCATTACTTAATAAGGATTTGTTACGCGCGA
    TATGCCGACGACTTACTACTGGGAATCGTGGGTGCCGTAGAGCTTCTCATAGAAATACAAAAACGTATC
    GCCCATTTCCTACAATCTGGCCTGAACCTTTGGGTAGGCTCCGCAGGATCAACAACAATAGCTGCACGG
    AGTACGGTAGAATTCCTTGGTACGGTCATTCGGGAAGTCCCTCCGAGGACGACTCCCATACAATTTTTG
    CGAGAGCTGGAAAAGCGTCTACGGGTAAAGCACCGTATCCATATAACTGCTTGCCACCTACGCTCCGCC
    ATCCATTCAAAGTTTAGGAACCTAGGTGATAGTATCCCGATCAAACAGCTGACGAAGGGGATGAGCAAA
    ACAGGGAGTCTACAGGACGGGGTTCAACTAGCGGAGACTCTTGGAACAGCTGGAGTCAGAAGTCCCCAA
    GTTAGCGTATTATGGGGGACCGTCAAGCACATCCGGCAAGGATCAAGGGGGATCTCGTTCTTGCATAGC
    TCAGGTCGGAGCAACGCGTCATCGGACGTTCAACAGGTAGTCTCACGATCGGGCACTCATGCCCGTAAG
    TTGTCATTGTATACTCCCCCGGGTCGGAAGGCGGCGGGGGAGGGAGGAGGACACTGGGCGGGATCTATC
    AGCAGCGAATTCCCCATAAAGATAGAGGCACCTATAAAAAAGATACTCCGAAGGCTTCGGGATCGAGGT
    ATCATTAGCCGAAGAAGACCCTGGCCAATCCACGTGGCCTGTTTGACGAACGTCAGCGACGAAGACATC
    GTAAATTGGTCCGCGGGCATCGCGATAAGTCCTCTGTCCTACTACAGGTGCCGCGACAACCTTTATCAA
    GTCCGAACGATTGTCGACCACCAGATTCGCTGGTCTGCAATATTCACCCTAGCCCACAAGCACAAATCC
    TCGGCGCCGAATATAATCCTCAAGTACTCCAAAGACTCAAATATTGTAAATCAAGAAGGTGGCAAGATC
    CTTGCAGAGTTCCCCAACAGCATAGAGCTTGGGAAGCTCGGACCCGGTCAAGACCTGAACAAGAAGGAA
    CACTCAACTACTAGTCTAGTCTAG
);
# Compare four ways of obtaining the text before, inside and after a match
# of the 11 bp target within $seq. Every candidate is run $count times;
# when $VERBOSE is set each one also warns its three pieces so that the
# outputs can be compared.
cmpthese(
    $count,
    {
        # Capture all three parts with explicit groups in a single pattern.
        'regex' => sub {
            my ( $prematch, $match, $postmatch )
                = $seq =~ m{(\A .*?) (CTGGCCCGTAA) (.*\z) }xms;
            warn "$prematch $match $postmatch" if $VERBOSE;
        },

        # Built-in match variables. Their mere presence in this file also
        # influences the timings of the other candidates.
        'matchvars' => sub {
            $seq =~ m{CTGGCCCGTAA}xms;
            my ( $prematch, $match, $postmatch ) = ( $`, $&, $' );
            warn "$prematch $match $postmatch" if $VERBOSE;
        },

        # Regexp::MatchContext: the (?p) marker in the pattern goes together
        # with the PREMATCH(), MATCH() and POSTMATCH() accessors.
        'context' => sub {
            $seq =~ m{(?p)CTGGCCCGTAA}xms;
            my ( $prematch, $match, $postmatch )
                = ( PREMATCH(), MATCH(), POSTMATCH() );
            warn "$prematch $match $postmatch" if $VERBOSE;
        },

        # Offsets of the last match: $-[0] and $+[0] are the start and end
        # of the WHOLE match. Index 0 is used on purpose; $#- / $#+ would
        # address the last capture group instead and only coincide with the
        # whole match while the pattern has no capture groups.
        'at_minus' => sub {
            $seq =~ m{CTGGCCCGTAA}xms;
            my $prematch  = substr( $seq, 0, $-[0] );
            my $match     = substr( $seq, $-[0], $+[0] - $-[0] );
            my $postmatch = substr( $seq, $+[0] );
            warn "$prematch $match $postmatch" if $VERBOSE;
        },
    }
);