The following is a parallel demonstration, for when extra performance is desired on very large sequences. Otherwise, the serial demonstration is faster.
use strict;
use warnings;
use BioUtil::Seq;
use constant { HDR => 0, SEQ => 1 };
use MCE::Flow;
use MCE::Shared;
# Open STDOUT through mce_open; every worker below prints its results to
# this one handle.
mce_open my $out_fh, '>', \*STDOUT or die "open error: $!\n";

# From the documentation:
#
# FastaReader returns an anonymous subroutine, when called, returns
# a fasta record which is a reference of an array containing the fasta
# header and sequence. By default, spaces and \r?\n are trimmed from
# the sequence.
#

# Pattern for one hit. The flanking groups sit inside zero-width assertions,
# so the flanking characters are captured without being consumed and may be
# shared by neighbouring hits.
my $hit_re = qr/
    (?<= (.{5}) )       # capture 1: the 5 characters in front of "abc"
    abc (.{10}) def     # capture 2: the 10 characters between "abc" and "def"
    (?=  (.{5}) )       # capture 3: the 5 characters following "def"
/x;

# Scan the FASTA file with 4 workers, one record per worker call.
# NOTE(review): with several workers the per-record output is presumably not
# emitted in input order -- confirm whether ordering matters to the consumer.
mce_flow {
    max_workers => 4,
    chunk_size  => 1,
    input_data  => FastaReader("input_file.fasta")
},
sub {
    my ( $mce, $chunk_ref, $chunk_id ) = @_;

    # The record is [ header, sequence ]. With chunk_size => 1 it is also
    # available as $_, so "my $fa = $_;" would be equivalent.
    my $fa = $chunk_ref->[0];

    # Debug aid: echo the record as it was read.
    # print ">$fa->[HDR]\n$fa->[SEQ]\n";

    # Sequence name = first whitespace-delimited word of the header. Any
    # whitespace (not only a single space) ends the name, and an empty
    # header yields '' rather than undef.
    my ($name) = $fa->[HDR] =~ /^\s*(\S+)/;
    $name = '' unless defined $name;

    # Start from '' so the emptiness check below never sees undef.
    my $output = '';
    while ( $fa->[SEQ] =~ /$hit_re/g ) {
        $output .= "$name: $1, $2, $3\n";
    }

    # One print per record; nothing is written for records without a hit.
    print $out_fh $output if length $output;
};
Regards, Mario.