Hi everyone,
I have a problem in executing the perl script (found online) is given below, a script t0 compare 2 files
1) a file with seq IDs and its weight
2) a file with seq IDs and the sequences.
I modified the original script a bit and tried to use the code with my data,but it neither prints out the output nor gives me no errors and further I want to add the weights in the file 1 to the sequence ID after comparing and extracting the respective reads.
use strict;
use warnings;
my $idsfile = "sample_IDs.txt";
my $seqfile = "sample_reads.fasta";
my %ids = ();
open FILE, $idsfile;
while(<FILE>) {
chomp;
$ids{$_} += 1;
}
close FILE;
local $/ = "\n>"; # read by FASTA record
open FASTA, $seqfile;
while (<FASTA>) {
chomp;
my $seq = $_;
my ($id) = $seq =~ /^>*(\S+)/; # parse ID as first word in FASTA
+header
if (exists($ids{$id})) {
$seq =~ s/^>*.+\n//; # remove FASTA header
$seq =~ s/\n//g; # remove endlines
print "$seq\n";
}
}
close FASTA;
sample_ID.txt
>comp10000_c0_seq1 15
>comp10001_c0_seq1 79
>comp10002_c0_seq1 7521
>comp10003_c0_seq1 41
>comp10004_c0_seq1 25
>comp10005_c0_seq1 96
>comp10006_c0_seq1 84
>comp10007_c0_seq1 63
>comp10008_c0_seq1 19
>comp1000_c0_seq1 35
>comp10010_c0_seq1 44
>comp10011_c0_seq1 32
>comp10012_c0_seq1 451
>comp10014_c0_seq1 6845
>comp10015_c0_seq1 521
>comp10016_c0_seq1 254
>comp10017_c0_seq1 51
>comp10018_c0_seq1 4512
>comp10019_c0_seq1 64
sample_reads.fasta
>comp10000_c0_seq0 len=159 path=[12:0-66 885:67-79 1106:80-158]
GGTTAATATTCCCGAGCCACGAGATTGGAGGGACGGCGTCCAGAGCACCTGCGGACTGAT
AGAATAGCCCGTTGGAGGTACTGAGTTGGAGAGGATTAAAAGACTCTCATAATACAAGGC
CCGTACTAAGCCCACTTACGATGGAATAGATGGTAGGAA
>comp10001_c0_seq2 len=100 path=[1:0-60 177:61-65 600:66-99]
ATCCCGCACGATTCCTGGAAACACTATCTCACCCCCAAAAGTGAAGAACCGTATAGAAAC
TCGAATACCTTGGTTTCGAGTACATCTTGTGCTCTTGAAT
>comp10002_c0_seq3 len=99 path=[2446:0-34 1163:35-98]
TTTTTTGTGATATATTAAATAATATATAAAAATACTATGGCAGGAAGTTTAAATAAAGTC
TTATTAATAGGCCGTTTAGGCGCAGACCCAGATATAAAA </p>
>comp10003_c0_seq1 len=166 path=[748:0-22 1004:23-46 2527:47-165]
AAGTAGCCTATGCGCTACAGTAAGAAAGACAGGTGAAAAAATGGAAGTAAAACAATTAGA
TGACTACTTTGGATATACAGAAAAGGGCAGTTCCTTAGAGGGGGAATTACGAGCAGGACT
AACGACATTCTTGACAATGGCGTACATTCTGTTTGTGAACCCAGAC
>comp10004_c0_seq1 len=143 path=[2167:0-44 2322:45-68 2508:69-142]
AATCTTTAATTTAAACTTAAAAAAAATTAACTTTTGAAAGGAATTAAAATGGAAAAAGAA
ATGTTAGTAGTAGCTAAATTAAAAGAAGGTACATTTGAAAAATTTATGGGTTTCATGCAA
TCGCCTGAAGGTTTAGCAGAAAG
>comp10005_c0_seq1 len=135 path=[2666:0-71 4268:72-134]
AATATTACCAGAAGTTACAGGTGATGTGACTTATTTACATTGCTTCGGTGAGTGTTCAGG
TGATGGTACAGGTGAATGCCCAAGTGGCGCTGTAACATGGATGCTTACAATGACTGTAAA
TACTGCTAATATCAC
>comp10006_c0_seq1 len=116 path=[4850:0-56 4046:57-115]
ATATAAGGTAGAAAATTACATTAACTATCTTTATCTATTTTATCACATTAGATATACAAA
CTTTTCCTTACAGAGAGGTCAAAAAATATGGAAGGAACTTTACTCTTAGCTAAATT
>comp10007_c0_seq1
CCGGGTTAGTCCGAATTTATTTTTTATATAAGGTAGATTTACAAAGTTATATGATTTTTT
TAATAGCCGCAATAGCACTCTTCATCTGGGTTGCATACACCGCAATGGAATTAAGAAAAG
CAACGCT
>comp10008_c0_seq1 len=298 path=[4908:0-189 5729:190-297]
ATTTTGATTTCAGAAAGATGTGTAACTGATGACGTGTTTACTTCAGTTCATATTGAAGAA
TATGAAATAATGGCAAGAGACACTAAACTAGGTGAAGAAGAAATTACCAGAGATATTCCT
AATGTTAATGAAGAAGCTTTAAAAAATCTTGATGAATCAGGGATAGTTTACATTGGAGCT
GAAGTAAACGCTGGTGATATTTTAGTTGGGAAAGTGACACCCAAAGGCGAGTCGCCAATG
ACCCCAGAAGAAAAATTGCTTCGAGCAATTTTTGGAGAGAAAGCTTCTGATGTAAGGG
>comp1000_c0_seq1 len=120 path=[5871:0-53 6857:54-119]
ATTATAACAAGAGGAGTGCACATGAGAAAAATGATATCATTGATTTCTGCGGTGCTAGTA
ATGTTTGCTTTTTCAAGCTCTATTACTTCAATCGCGAAATCAGCAGAGTTCTTTACAATA
>comp10010_c0_seq1 len=108 path=[7438:0-30 7277:31-107]
ATTCAGACTTTCTAACCAGACAGATGATCTAGATTGGATCGTTGGTCTTTTCTACCAAGA
AAGAGAAGAAGGTTGGGATTTCGAAACGGAAACCCACAACTACAGAAA
>comp10011_c0_seq1 len=105 path=[7198:0-63 8366:64-81 10234:82-104]
ATTGTTAAGTTTGTGTTTTTATGAATTGAAAATAAGAACTACAAACACGCTATTGAAAAT
TGTAAATATGTTTTCGGCTTTCATCGATTAATCAGAAGGTGATTG
>comp10012_c0_seq1 len=134 path=[8117:0-30 7737:31-54 7822:55-133]
ATATGTTTTTTTTATTAACACTTAAATTAACTAAATGGTACGATTCTTGAATAGATGATC
GAAATAGTTGGAGTTTTGTAATGGATTTAGCCTCCCTAATTGGACTTGTTGGTGCAGTCG
GTATGATTATAGCA
>comp10014_c0_seq1en=175 path=[7957:0-85 8425:86-174]
CTCGCGGTCTTTCAGAGGATGCTGAGAAATTCCTAGAGAAGGTTCGTGCGAATGGTGGCT
CCAAGATGATTAAGACACACTGCCGAAGTATGTACGTATTACCAGAGATGGTCGGTACTA
>comp10015_c0_seq1 len=122 path=[8708:0-78 9126:79-121]
AAGTGAATGTTTTTTAATTAAAACTTAGGTGGGAACCTGTAAAAAGGCGCACCATCGACC
GGTCATGGGCTTCGGTCCAAGATCTGAGTATGAGTTTTTATGCTGGGACCCGAAAGATGG
TG
>comp10016_c0_seq1 len=210 path=[9869:0-132 11973:133-209]
TTGTAGTATTTATGAATAAAACTGACCAAGTTGATGATGAAGAACTTGTTGAGCTTGTTG
AAATGGAGCTTCGCGACCTATTAAATGAATATGAATTCCCAGGTGATGATATTCCTATAG
TTAAAGGTTCTGCACTTAAAGCTCTTGATAATGTTTCTGATGAAGCTGCTACTGCTTGTA
TAGCTGAACTTATGGAAGCTTGTGATAGTT
>comp10017_c0_seq0 len=103 path=[11024:0-19 11107:20-43 <p> 11572:44-
+48 11236:49-102]
TTTTTTGGTCAACTAGGTGGCTGAGAATCATTATTCTATGGTTATAAGGCACCCTTATTG
ACATTTAAACGTTGGAGGAAATCATAAATGCGTATTAAATCAA
>comp10018_c0_seq2 len=77 path=[12879:0-24 13307:25-76]
ATAAAACACAATACACCGCCCATCTTATGCGGTGTCTAAATATAACTTAACCCTATTAAG
TTTCCTAATAAACTTAT
>comp10019_c0_seq3 len=112 path=[14245:0-49 14086:50-111]
ATTTAGGACAAGTCTCTGCTGCAATGGGAACAGGAGTTGATCCTGGAGCTGCAATTGGAA
ATGTAGCAGGTGCTGCGATGGCAGCTGAGGGAGCAGGCGGATTAGCAAGTGC
expected output:
>comp10003_c0_seq1 len=166 path=748:0-22 1004:23-46 2527:47-165_weight=41
AAGTAGCCTATGCGCTACAGTAAGAAAGACAGGTGAAAAAATGGAAGTAAAACAATTAGA
TGACTACTTTGGATATACAGAAAAGGGCAGTTCCTTAGAGGGGGAATTACGAGCAGGACT
AACGACATTCTTGACAATGGCGTACATTCTGTTTGTGAACCCAGAC
Could anyone please help me out.
Thank you in advance.