Test_duplicates.txt
>sp|C1JCT2|POLFS_SINV3 Polyprotein-FSD OS=Solenopsis invicta virus 3 OX=631345 PE=1 SV=2
MSEKTQTFVQNETHVLDMTSDFKSDLSLEKVTSSVEQTDDLVSKIINNNDLDIKDLSFLR
NLLLSTLQYLG
>sp|C1JCT2|POLFS_SINV3 Polyprotein-FSD OS=Solenopsis invicta virus 3 OX=631345 PE=1 SV=2
MSEKTQTFVQNETHVLDMTSDFKSDLSLEKVTSSVEQTDDLVSKIINNNDLDIKDLSFLR
NLLLSTLQYLG
poj_duplicate.pl
#!/usr/bin/perl
use strict;
use warnings;
# Report the amino-acid alphabet size of every unique sequence in a FASTA file.
#
# Prompts on STDIN for the FASTA filename, then:
#   * writes, for each unique record, its name, per-letter residue counts and
#     alphabet size to sdAb_report.txt, followed by the record(s) with the
#     smallest alphabet and all records in ascending alphabet size;
#   * prints every duplicate sequence to STDOUT as it is found;
#   * prints to STDOUT, for each letter A-Z, the number of unique records in
#     which that letter never occurs.
#
# Dies if the report or the input file cannot be opened, if no filename is
# entered, or if the report cannot be flushed on close.
my $report_name = 'sdAb_report.txt';
open my $out_file, '>', $report_name
    or die "Cannot open '$report_name' because: $!";

print 'PLEASE ENTER THE FILENAME OF THE PROTEIN SEQUENCE: ';
my $prot_filename = <STDIN>;
die "No filename was entered\n" unless defined $prot_filename;
chomp $prot_filename;
open my $PROTFILE, '<', $prot_filename
    or die "Cannot open '$prot_filename' because: $!";

my @count;                   # one [ alphabet size, name ] pair per unique record
my %absent;                  # letter => number of unique records lacking it
my %fasta_seen;              # whitespace-free sequences seen so far
my $name = '(no header)';    # header of the most recently read record

{
    # Paragraph mode, confined to this block so that the input record
    # separator is restored for any code that runs afterwards.
    local $/ = '';

    while ( my $para = <$PROTFILE> ) {

        # Blank lines between FASTA records are optional, so one paragraph
        # may hold several records: cut it in front of every '>' line.
        FASTA_RECORD:
        for my $record ( split /^(?=>)/m, $para ) {

            # Remove fasta header line. A chunk without one (sequence text
            # continuing after a blank line) keeps the previous name.
            my $has_header = $record =~ s/\A>(.*)//;
            if ($has_header) {
                $name = $1;
                $name =~ s/\s+\z//;    # drop trailing blanks / CR
            }

            # Remove comment line(s)
            $record =~ s/^\s*#.*//mg;

            # Line wrapping and trailing newlines are layout, not sequence;
            # strip them so identical sequences always compare equal.
            ( my $sequence = $record ) =~ s/\s+//g;

            # Nothing but comments or whitespace: not a record at all.
            next FASTA_RECORD unless $has_header || length $sequence;

            if ( $fasta_seen{$sequence}++ ) {
                print "DUPLICATE : $name \n $sequence\n";
                next FASTA_RECORD;
            }

            # Count each upper-case residue letter; the character class
            # must be bracketed, '(A-Z)' would only match the text "A-Z".
            my %prot;
            $prot{$_}++ for $sequence =~ /[A-Z]/g;
            my $num = keys %prot;

            # Stored as a pair so the report below can sort on the count
            # and still print the matching name.
            push @count, [ $num, $name ];

            print $out_file "$name\n";
            print $out_file
                join( ' ', map { "$_=$prot{$_}" } sort keys %prot ), "\n";
            printf $out_file "Amino acid alphabet = %d\n\n", $num;

            # count absent
            for my $letter ( 'A' .. 'Z' ) {
                ++$absent{$letter} unless exists $prot{$letter};
            }
        }
    }
}
close $PROTFILE;

# sort names by count in ascending order to get lowest
my @sorted = sort { $a->[0] <=> $b->[0] } @count;

if (@sorted) {
    my $lowest = $sorted[0][0];

    # maybe more than 1 lowest
    printf $out_file "Least number of proteins is %d in these entries\n",
        $lowest;
    print $out_file "$_->[1]\n" for grep { $_->[0] == $lowest } @sorted;

    # show all results
    print $out_file "\nAll results in ascending count\n";
    printf $out_file "%d %s\n", @$_ for @sorted;
}
else {
    # Avoid dereferencing an empty list when the input held no records.
    print $out_file "No sequences found in '$prot_filename'\n";
}

# Buffered write errors only surface at close, so check it.
close $out_file
    or die "Cannot close '$report_name' because: $!";
print "\nResults are printed in $report_name\n";

# print absent counts
print "\nNon-incorporation of various amino acids in $prot_filename is as follows\n";
for my $letter ( sort keys %absent ) {
    printf "%s=%d\n", $letter, $absent{$letter};
}
|
|