Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

davi54's scratchpad

by davi54 (Sexton)
on Feb 21, 2019 at 21:30 UTC ( [id://1230332]=scratchpad: print w/replies, xml ) Need Help??

Test_duplicates.txt
>sp|C1JCT2|POLFS_SINV3 Polyprotein-FSD OS=Solenopsis invicta virus 3 OX=631345 PE=1 SV=2 MSEKTQTFVQNETHVLDMTSDFKSDLSLEKVTSSVEQTDDLVSKIINNNDLDIKDLSFLR NLLLSTLQYLG >sp|C1JCT2|POLFS_SINV3 Polyprotein-FSD OS=Solenopsis invicta virus 3 OX=631345 PE=1 SV=2 MSEKTQTFVQNETHVLDMTSDFKSDLSLEKVTSSVEQTDDLVSKIINNNDLDIKDLSFLR NLLLSTLQYLG


poj_duplicate.pl
#!/usr/bin/perl
use strict;
use warnings;

# poj_duplicate.pl
#
# Reads a FASTA-style protein file whose records are separated by blank
# lines, skips records whose sequence has already been seen, and writes a
# per-record amino-acid tally to a report file.
#
# Written to the report file ($report_name):
#   * for every unique record: header, "LETTER=count" pairs, alphabet size
#   * the record(s) with the smallest alphabet
#   * all records sorted by ascending alphabet size
# Written to STDOUT:
#   * a "DUPLICATE" notice for every repeated sequence
#   * for each letter A..Z, how many unique records do not contain it
#
# The file is a modulino: main() runs when the file is executed as a script
# and is skipped when the file is loaded with require/do.

my $report_name = 'sdAb_report.txt';

main() unless caller;

# Prompt for the input filename on STDIN, analyse the file, write the report.
# Dies if no filename is supplied or a file cannot be opened/closed.
sub main {
    print 'PLEASE ENTER THE FILENAME OF THE PROTEIN SEQUENCE: ';
    my $prot_filename = <STDIN>;
    die "No filename was entered\n" unless defined $prot_filename;
    chomp $prot_filename;

    # Open the input before the report so that a mistyped filename does not
    # truncate an existing report.
    open my $PROTFILE, '<', $prot_filename
        or die "Cannot open '$prot_filename' because: $!";
    open my $out_file, '>', $report_name
        or die "Cannot open '$report_name' because: $!";

    my @count;                  # one [ alphabet size, header ] per unique record
    my %absent;                 # letter => number of unique records lacking it
    my %fasta_seen;             # whitespace-free sequence => times seen so far
    my $name = '(no header)';   # header of the most recent record

    {
        local $/ = '';          # paragraph mode, restored when the block ends

        FASTA_RECORD:
        while ( my $para = <$PROTFILE> ) {

            # Remove the fasta header line, remembering its text
            my $has_header = $para =~ s/^>(.*)//m;
            $name = $1 if $has_header;

            # Remove comment line(s)
            $para =~ s/^\s*#.*//mg;

            # Key duplicates on the residues alone: the last paragraph of a
            # file ends with different trailing newlines than the others, so
            # comparing the raw text would miss a duplicate at end of file.
            ( my $sequence = $para ) =~ s/\s+//g;

            # A paragraph holding nothing but comments is not a record
            next FASTA_RECORD if !$has_header && $sequence eq '';

            if ( $fasta_seen{$sequence}++ ) {
                print "DUPLICATE : $name \n $para\n";
                next FASTA_RECORD;
            }

            # Tally each upper-case residue letter
            my %prot;
            $prot{$1}++ while $sequence =~ /([A-Z])/g;

            my $num = keys %prot;
            push @count, [ $num, $name ];

            print  {$out_file} "$name\n";
            print  {$out_file}
                join( ' ', map "$_=$prot{$_}", sort keys %prot ), "\n";
            printf {$out_file} "Amino acid alphabet = %d\n\n", $num;

            # count absent
            for my $letter ( 'A' .. 'Z' ) {
                ++$absent{$letter} unless exists $prot{$letter};
            }
        }
    }
    close $PROTFILE or die "Cannot close '$prot_filename' because: $!";

    # sort records by alphabet size in ascending order to get the lowest
    my @sorted = sort { $a->[0] <=> $b->[0] } @count;
    my $lowest = @sorted ? $sorted[0][0] : 0;    # 0 when the file had no records

    # maybe more than 1 lowest
    printf {$out_file} "Least number of proteins is %d in these entries\n",
        $lowest;
    print {$out_file} "$_->[1]\n" for grep { $_->[0] == $lowest } @sorted;

    # show all results
    print  {$out_file} "\nAll results in ascending count\n";
    printf {$out_file} "%d %s\n", @{$_} for @sorted;

    # Buffered write errors only surface at close, so check it
    close $out_file or die "Cannot close '$report_name' because: $!";
    print "\nResults are printed in $report_name\n";

    # print absent counts
    print "\nNon-incorporation of various amino acids in $prot_filename is as follows\n";
    printf "%s=%d\n", $_, $absent{$_} for sort keys %absent;

    return;
}
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others scrutinizing the Monastery: (4)
As of 2024-04-19 14:50 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found