UPDATE:
Hello perl monks,
So I noticed I need to study more about data structure, yet I got stuck at complex structure (foreach / nested if)problem.. I've been debugging with senior perl monks' suggestions.. I really appreciate that..
This is what I am trying to do:
1) input#1 (bwa) file gets read and important string data gets stored including Amplicon sequence ID (input#2 ID part) and Input sequence ID (input#3 ID part).
This means that I need to extract these IDs to match them later.
2) using input#1 key, match Input sequence ID (input#3 ID) from input#3 file.
3) again use input#1 hash's 2nd element (stored Reference Amplicon ID) match Reference Amplicon ID (input#2 ID) from input#2 file.
4)so the sequences (2nd element) from input#2 and input#3 can be printed out.
The main problem is that the key for Amp hash is wrong because it does not have the same ID as bwa hash and Input hash. I am not sure how to solve this problem. I cannot match only input#2 and input#3 because their ID are different. bwa (input#1)'s first element and Input(input#3)'s first element have input sequence IDs, but bwa 2nd element 1 and Amp (input#2)'s 1st element have the same IDs.
Basically I am trying to extract the two different IDs from first file, find one ID from 2nd file, find the other ID from 3rd file, then match their corresponding sequences.
So far, I figured out that foreach loop at least works, but not nested if's. because of Amp hash. It's supposed to print out all the 'print' that I added.. but what I get for my output is
'Use of uninitialized value in string eq at BSanalyzer.pl line 127' error message (something is wrong with my foreach loop's if ($bwa{$ID}1 eq $Amp{$ID}[0]) line) and
"1233out1233out1233out1233out1233out1233out1233out" which means that it does not go into my 3rd if... Why is this happening?
#!/usr/bin/perl -w
use warnings;
use strict;
# BWA alignment output (.sam)
my %bwa = ();
my $file1 = shift;
open (FILE1, "$file1") || die "Failed to open $file1 for reading : $!"
+; # Open second file
while (<FILE1>) { # Reading second hash
if ($_ =~ /^[^@]/s) {
chomp;
my @line = split /\s+/, $_;
my $ID;
if ($line[2] =~ /[^*]/) {
$ID = $line[0];
$bwa{$ID}[0] = $line[0]; # seq ID
$bwa{$ID}[1] = $line[2]; # Ref ID
$bwa{$ID}[2] = $line[5]; # CIGAR ID for insertion
#$bwa{$ID}[3] = @line[9]; # Processed seq (already C->T)
$bwa{$ID}[3] = $line[12]; # Edit distance (edited area by # of bas
+e) : NM
$bwa{$ID}[4] = $line[15]; # No. of mismatches in the alignment : X
+M
$bwa{$ID}[5] = $line[16]; # No. of gap opens for insertion : XO
$bwa{$ID}[6] = $line[17]; # No. of gap extensions for deletion :XG
$bwa{$ID}[7] = $line[18]; # Mismatching positions / bases : MD
}
}
}
close FILE1 || die "Failed to close $file1 : $!";
# ORIGINAL Reference Amplicon File (.fa)
my %Amp = ();
my $file2 = shift;
open (FILE2, "$file2")|| die "Failed to open $file2 for reading : $!";
+ # Open first file
local $/= ">";
my $first=<FILE2>;
while (<FILE2>) { # Reading first hash
chomp;
my ($ID, $Seq) = split("\n");
$Amp{$ID}[0] = $ID;
$Amp{$ID}[1] = $Seq;
}
close FILE2 || die "Failed to close $file2 : $!";
# ORIGINAL Input FASTQ Sequencing File (.fq)
my %Input = ();
my $file3 = shift;
open (FILE3, "$file3")|| die "Failed to open $file3 for reading : $!";
+ # Open first file
local $/= "@";
$first=<FILE3>;
while (<FILE3>) { # Reading first hash
chomp;
my ($ID, $Seq,undef,undef) = split("\n");
$Input{$ID}[0] = $ID;
$Input{$ID}[1] = $Seq;
}
close FILE3 || die "Failed to close $file3 : $!";
foreach my $ID (keys %bwa)
{
print "1";
if (exists $Input{$ID}[0] ){
print "2";
if ($bwa{$ID}[0] eq $Input{$ID}[0]){
print "3";
if ($bwa{$ID}[1] eq $Amp{$ID}[0]){
print "4";
if ($bwa{$ID}[3] eq "NM:i:0" && $bwa{$ID}[4] eq "XM:i:0" && $b
+wa{$ID}[5] eq "XO:i:0" && $bwa{$ID}[6] eq "XG:1:0") {
print "$Amp{$ID}[1]\n$Input{$ID}[1]";
}
else {print "4out";}
}
else {print "3out";}
}
else {print "2out";}
}
else {print "1out";}
}
exit;
Also here are three input files:
input#1:
@SQ SN:TMEM200B LN:293
@SQ SN:B3GAT2-2_P001 LN:204
Seq1Perfect 0 B3GAT2-2_P001 1 37 204M * 0 0
+ GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACG
+TCGGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTG
+GCGCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+ &a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$
+a=$aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=
+aa$a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
+ XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0
+ MD:Z:204
Seq2MM 0 B3GAT2-2_P001 1 37 204M * 0 0 GGTT
+AATTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTCGGG
+TTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGCGCG
+CGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT &a=
+=aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$aa
+a==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$a$
+a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa== XT
+:A:U NM:i:2 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 M
+D:Z:4G0G198
Seq3In 0 B3GAT2-2_P001 1 37 12M1I192M * 0 0
+ GGTTGGTTTTTAGTTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTAC
+GTCGGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGT
+GGCGCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+ &a==aa===a==a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=
+a$a=$aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===a
+a=aa$a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa=
+= XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i
+:1 MD:Z:204
Seq4Del 0 B3GAT2-2_P001 1 37 55M6D143M * 0 0
+ GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAGTACGTCGGG
+TTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGCGCG
+CGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT &a=
+=aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$aa
+a==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$a$
+a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aa XT:A:U
+ NM:i:6 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:6 MD:Z:55
+^GAAGAA143
Seq5Partial 0 B3GAT2-2_P001 1 37 204M * 0 0
+ GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTTGTTGTTAGCGAAGAAGAGTACG
+TCGGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTG
+GCGCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+ &a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$
+a=$aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=
+aa$a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
+ XT:A:U NM:i:2 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0
+ MD:Z:45C2C155
Seq6TruncB 0 B3GAT2-2_P001 1 37 189M * 0 0
+GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGT
+CGGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGG
+CGCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGT &a==aa=====a==
+====aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$aaa==a$a$a$a=
+=aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$a$a$aa=aa==$a
+aa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa= XT:A:U NM:i:0 X0:i:1
+ X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:189
Seq7TruncF 0 B3GAT2-2_P001 16 37 189M * 0 0
+ TTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTCGGGTTGCGCGCGT
+TGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGCGCGCGGTAGTTCG
+GGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT ===aaaaaaa===
+=aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$aaa==a$a$a$a==aa=a==aa=a===
+==aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$a$a$aa=aa==$aaa=$a===a=aa$a
+=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa== XT:A:U NM:i:0 X0:i:1
+ X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:189
Seq8Incomplete 4 * 0 0 * * 0 0 GGTTGGTTCTTA
+TTCCTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTCGGGTTGCGCGC
+GTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGCGCGCGGTAGTT
+CGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT &a==aa=====
+a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$aaa==a$a$a
+$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$a$a$aa=aa=
+=$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
input#2:
>TMEM200B
CTCCTCTGCCTGGCTGGTCTTGATCCGAGCGGTCTTCCCGGTGTCTAGCTCAAGTCGCTCCTGCTGCAGC
+TTCGCTGCGGGCGGAGGAGGTCTGGAAGGAGGGGGCGGGCAGGGAGAGGCTGGAGCCGGTGACGCCCCC
+TCCTCCCGCGCTGCGGTATGTAAAGCACAGTAGGGGGGAGGTGGGGCCCGGCGAGCGACCCCTGCGGAC
+CTGGGAGGCCCGAGCGCCCCCGCCCCATTTGCTACGGTGCAGCCACGTGCGGGGGTGGGGTCGAGCCCG
+GGAGGTACTTACCCTGGAGA
>B3GAT2-2_P001
GGCTGGCCTTTACCTCCTGGAAGAGCTCCAGACTATAGGTGTTGTCGTCGTCAGCGAAGAAGAGCACGCC
+GGGCTGCGCGCGCTGGTGCTGGTGCCTCTGGCGCAGCCAGGCGAGGCCCGCGTTGCGCTGCTCAGTGGC
+GCGCGGCAGCCCGGGCCGCTTGTAGCGCCGCGGCGTGGGCACGTGCAGGTGAGTGCTGGGCAGCC
input#3:
@Seq1Perfect
GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTC
+GGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGC
+GCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq1Perfect
&a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$
+aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$
+a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
@Seq2MM
GGTTAATTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTC
+GGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGC
+GCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq2MM
&a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$
+aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$
+a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
@Seq3In
GGTTGGTTTTTAGTTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGT
+CGGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGG
+CGCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq3In
&a==aa===a==a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=
+$aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa
+$a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
@Seq4Del
GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAGTACGTCGGGTTG
+CGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGCGCGCGG
+TAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq4Del
&a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$
+aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$
+a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aa
@Seq5Partial
GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTTGTTGTTAGCGAAGAAGAGTACGTC
+GGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGC
+GCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq5Partial
&a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$
+aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$
+a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
@Seq6TruncB
GGTTGGTTTTTATTTTTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTC
+GGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGC
+GCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGT
+Seq6TruncB
&a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$
+aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$
+a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=
@Seq7TruncF
TTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTCGGGTTGCGCGCGTTG
+GTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGCGCGCGGTAGTTCGGG
+TCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq7TruncF
===aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$aaa==a$a$a$a==a
+a=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$a$a$aa=aa==$aaa
+=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==
@Seq8Incomplete
GGTTGGTTCTTATTCCTTGGAAGAGTTTTAGATTATAGGTGTTGTCGTCGTTAGCGAAGAAGAGTACGTC
+GGGTTGCGCGCGTTGGTGTTGGTGTTTTTGGCGTAGTTAGGCGAGGTTCGCGTTGCGTTGTTTAGTGGC
+GCGCGGTAGTTCGGGTCGTTTGTAGCGTCGCGGCGTGGGTACGTGTAGGTGAGTGTTGGGTAGTT
+Seq8Incomplete
&a==aa=====a======aaaaaaa====aaa==a=aaa=a==a=$a=$a==aa$aaaaaaaaa=a$a=$
+aaa==a$a$a$a==aa=a==aa=a=====aa$a=aa==aaa$aaaa==$a$a==a$a==a===aa=aa$
+a$a$aa=aa==$aaa=$a===a=aa$a=$a$aa$a=aaa=a$a=a=aaa=aaa=a==aaa=aa==