>perl -wMstrict -le "use List::MoreUtils qw(uniq); ;; my @seqs = qw(AGCT AGGT GG AGCT CTAG); ;; my $seen = ''; my $delim = ':'; ;; my @no_dups = grep { ($seen !~ m{$_}xms) && ($seen .= $delim . $_) } uniq @seqs ; print qq{'$_'} for @no_dups; " 'AGCT' 'AGGT' 'CTAG' #### use warnings; use strict; use List::MoreUtils qw(uniq); use Test::More 'no_plan'; use Test::NoWarnings; MAIN: { # main loop VECTOR: for my $ar_vector ( # test input expected output [ [qw(AGCT AGGT GG AGCT) ], [qw(AGCT AGGT)], ], [ [qw(A AA AAA AAAA AAAAA)], [qw(AAAAA)], ], [ [qw(AAAAA AAAA AAA AA A)], [qw(AAAAA)], ], [ [qw(ACGT CGTA GTAC TACG)], [qw(ACGT CGTA GTAC TACG)], ], [ [qw(A C G T T G C A T G C A G T A C)], [qw(A C G T)], ], [ [qw(A G AG AGCT AGGT CTAG N AAAAA TT CCC GG AGCT AGC GCT C T CT)], [qw(AGCT AGGT CTAG N AAAAA TT CCC)], ], [ [qw(A AG AGC AGCT AGCTA AGCTAG AGCTAGC AGCTAGCT AGCTAGCTA)], [qw(AGCTAGCTA)], ], [ [qw(AGCTAGCTA AGCTAGCT AGCTAGC AGCTAG AGCTA AGCT AGC AG A)], [qw(AGCTAGCTA)], ], ) { my ($ar_test, $ar_expect) = @$ar_vector; is_deeply [ filter($ar_test) ], $ar_expect; } # end for VECTOR } # end MAIN loop # subroutines ###################################################### sub filter { my ($ar_sequences, # ref. to array of sequences to be filtered ) = @_; # eliminate duplicate sequences of equal length. my @from_uniq = uniq @$ar_sequences; my $delim = ':'; # prepare filter for substring elimination. my ($uniq_longest_first) = map { qq{$delim$_$delim} } join $delim, sort { length($b) <=> length($a) } # longest strings first @from_uniq ; # eliminate sequences that are substrings of ANY other sequence. return grep { my $len = length $_; $uniq_longest_first =~ m{ $delim [^$delim]{$len} $delim # stop checking if == len | $_ (*COMMIT)(*FAIL) # MIS-match if any substring }xms; } @from_uniq ; } #### sub filter4 { my ($ar_sequences, # ref. to array of sequences to be filtered ) = @_; # eliminate duplicate sequences of equal length. # a sequence may still exist as a substring in a longer sequence. my @from_uniq = uniq @$ar_sequences; # separator string. must be distinct from anything in a sequence. my $sep = ':'; # string of unique sequences, sorted shortest to longest. my $uniq_short_to_long = join $sep, sort { length($a) <=> length($b) } @from_uniq ; # joined string must end with a separator string. $uniq_short_to_long .= $sep; # some convenience regexes. my $base = qr{ [^\Q$sep\E] }xms; $sep = qr{ \Q$sep\E }xms; # convert to regex object # build index of offsets of first position beyond each length. my %offset; LENGTH_GROUP: while ($uniq_short_to_long =~ m{ \G ($base+) $sep }xmsg) { # build regex for sequences of this length. my $n_bases = length $1; my $n_seq = qr{ (?:$base){$n_bases} $sep }xms; # find, save offset of 1st longer seq after these sequences. $uniq_short_to_long =~ m{ \G $n_seq* }xmsg; $offset{ $n_bases } = pos $uniq_short_to_long; } # end while LENGTH_GROUP # keep all sequences NOT substrings of any LONGER sequence. return grep { $[ > index $uniq_short_to_long, $_, $offset{length $_}; } @from_uniq ; } # end sub filter4()