#!/usr/bin/perl
use strict;
use warnings;
my %sequences;
# read the input files, passing in a filename and
# a reference to the sequences hash
read_file("file1", \%sequences);
read_file("file2", \%sequences);
# write the output file, passing in a filename and
# a reference to the sequences hash
write_file("file3", \%sequences);
sub read_file
{
my $filename = shift;
my $seqref = shift;
open SEQFILE, "<$filename" or die "can't open $filename: $!";
# use greater-than for record separator
# see perldoc perlvar
local $/ = ">";
while(my $line = <SEQFILE>)
{
# the data will match a word, followed by a
# newline, followed by another word
# see perldoc perlretut and perlre
if ($line =~ m/(\w+)\n(\w+)/s)
{
# the first word captured is the name
my $name = $1;
# the second word captured is the sequence
my $sequence = $2;
# add the sequence on to what ever is
# already there
$seqref->{$name} .= $sequence;
}
}
close SEQFILE;
}
sub write_file
{
my $filename = shift;
my $seqref = shift;
open OUTPUT, ">$filename" or die "can't open $filename: $!";
# write the sequences hash to the output file
for my $key (sort keys %{$seqref})
{
print OUTPUT ">$key\n$seqref->{$key}\n"
or die "can't write: $!";
}
close OUTPUT or die "can't close $filename: $!";
}
__END__
| [reply] [Watch: Dir/Any] [d/l] |
use strict;
use Bio::SeqIO;
use Bio::Seq;
my %seqs;
my @files = qw(file1 file2);
for my $file ( @files ) {
my $in = Bio::SeqIO->new(-format => 'fasta', -file => $file);
while( my $seq = $in->next_seq ) {
$seqs{$seq->display_id} .= $seq->seq; # append the seqdata
}
}
my $out = Bio::SeqIO->new(-format => 'fasta', -file => '>newfile.fa');
while( my ($seqname,$seqstr) = each %seqs ) {
my $seq = Bio::Seq->new(-id => $seqname, -seq => $seqstr);
$out->write_seq($seq);
}
Of course if you wanted to have sequence data in another file format you just have to change the 'fasta' to something else like 'genbank', 'swiss', 'embl', etc... | [reply] [Watch: Dir/Any] [d/l] |
This is what I came up with:
#!/bin/perl5
use strict;
use warnings;
@ARGV = qw( file1.txt file2.txt );
my $name;
my %hash;
while (<>){
chomp;
my $record = $_;
if ( $record =~ /^>/ ){
$name = $record;
next;
}
else{
$hash{$name} .= $record
}
}
for my $key ( keys %hash ){
print "$key\n$hash{$key}\n"
}
Updated: Use hash instead of a hashref | [reply] [Watch: Dir/Any] [d/l] |
| [reply] [Watch: Dir/Any] [d/l] |
| [reply] [Watch: Dir/Any] |
Hi newperler.
I felt as though you might enjoy these two books:
- Title: Learning Perl (3rd edition)
- ISBN: 0596001320
- List Price: $34.95
- Title: Beginning Perl for Bioinformatics
- ISBN: 0596000804
- List Price: $39.95
and the following additional site:
bioperl.org
Hope this helps,
-Katie.
| [reply] [Watch: Dir/Any] |
Are you confident that the two files have the same number of lines, the same number and set of individual names, and the same number of sequences per individual? And, are the individuals presented in the same order? That is, are the two files already aligned?
If there is a lack of alignment between the two, then some more specification is needed about what to do to reconcile differences between the two files.
But if the files are fully aligned, you could use a trivial perl script to join the two files line-by-line (or use the time-honored "paste" utility that has been in /usr/bin/ on unix systems since forever):
#!/usr/bin/perl
open( F1, "file1" ) or die "file1: $!";
open( F2, "file2" ) or die "file2: $!";
while ( $s1 = <F1> ) {
$s2 = <F2>;
if ( $s1 =~ /^>/ ) {
print $s1;
} else {
chomp $s1;
print "$s1$s2"; # (update: added "$" on "s1")
}
}
or, using the "join" command and an even shorter perl script (which just removes duplicated individual names):
paste -d '' file1 file2 | perl -pe 's/(?<=\w)>.*//' > files.joined
(updates: mistakenly started with "join" command instead of "paste"; added the "-d ''" option on the paste command, to concatentate input strings with no separator. Regarding the (?<=\w)>.*, see the perlre man page about "zero-width look-behind assertion" -- this RE is checking for an angle bracket preceded by an alphanumeric, and replacing that angle bracket, and everything after it on the line, with an empty string. If a name ends in something other than a letter or digit, this will fail to remove the duplication.) | [reply] [Watch: Dir/Any] [d/l] [select] |