# Given a GFF file and an EMBL file for a genome, extract the upstream
# sequences that correspond to intergenic regions.
#
# Inputs  : file.gff, bacteria.embl
# Outputs : file.upstream.coord.txt (tab-separated: id, from, to, strand)
#           file.upstream.fa        (FASTA, one entry per upstream region)
# Requires: perl and sfetch on the PATH.

# STEP 1: parse the GFF file to get the upstream regions.
#
# Every GFF record is treated as a gene (the feature-type column is not
# inspected, so pre-filter the GFF if it mixes feature types). Records are
# identified by their line number in the GFF file.
#   * "+" strand gene: the upstream region lies between the end (column 5) of
#     the PREVIOUS record and the start (column 4) of the current record.
#     If the very first record is on the "+" strand, the region runs from
#     position 1 up to its start.
#   * "-" strand gene: the upstream region lies between the end (column 5) of
#     the current record and the start (column 4) of the SUBSEQUENT record.
#     The last record has no successor, so no region is emitted for it when it
#     is on the "-" strand.
# Regions shorter than 10 nt are discarded.
perl -e '
    use strict;
    use warnings;

    # Regions shorter than this many nucleotides are discarded.
    my $min_region_length = 10;

    # One entry per usable GFF record, kept in file order.
    my @records;
    while (my $line = <>) {
        my $line_number = $.;
        $line =~ s/\r?\n\z//;

        # Comment/directive lines and lines without a strand column are not
        # gene records and must not act as neighbours of real genes.
        next if $line =~ /\A#/;
        my @fields = split /\t/, $line;
        next if @fields < 7;

        my ($start, $end, $strand) = @fields[3, 4, 6];

        # The coordinates end up inside shell commands in step 2, so only
        # plain integers are accepted.
        next unless $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/;

        push @records, {
            id     => $line_number,
            start  => $start,
            end    => $end,
            strand => $strand,
        };
    }

    for my $index (0 .. $#records) {
        my $gene = $records[$index];
        my ($from, $to);

        if ($gene->{strand} eq "+") {
            # First record: there is no previous gene, count from position 1.
            $from = $index == 0 ? 1 : $records[$index - 1]{end};
            $to   = $gene->{start};
        }
        elsif ($gene->{strand} eq "-") {
            # Last record: there is no subsequent gene to bound the region.
            next if $index == $#records;
            $from = $gene->{end};
            $to   = $records[$index + 1]{start};
        }
        else {
            # Unstranded or unknown strand: upstream direction is undefined.
            next;
        }

        # Drops overlapping genes (negative length) and very short gaps.
        next if $to - $from < $min_region_length;

        print join("\t", $gene->{id}, $from, $to, $gene->{strand}), "\n";
    }
' file.gff > file.upstream.coord.txt

# STEP 2: extract the sequences through sfetch.
#
# For each region, fetch up to 200 nt upstream of the gene start plus 7 nt
# into the gene (the 3 nt start codon and the 5 nt after it). If the
# intergenic region is shorter than 200 nt, only the region itself is used.
# For "-" strand genes the "from" coordinate is larger than the "to"
# coordinate. One sfetch command is printed per region and the commands are
# run by piping them to sh.
perl -e '
    use strict;
    use warnings;

    my $upstream_length   = 200;    # maximum nt taken upstream of the gene
    my $downstream_length = 7;      # nt taken past the first base of the gene

    while (my $line = <>) {
        $line =~ s/\r?\n\z//;
        my ($id, $start, $end, $strand) = split /\t/, $line;
        next unless defined $strand;

        # These values are interpolated into commands executed by sh, so
        # anything that is not a plain integer is refused.
        unless ($id =~ /\A\d+\z/ && $start =~ /\A\d+\z/ && $end =~ /\A\d+\z/) {
            warn "skipping malformed coordinate line $.\n";
            next;
        }

        my $length = $end - $start;
        my ($from, $to);

        if ($strand eq "+") {
            # The gene starts at $end; a region of exactly 200 nt yields the
            # same coordinates through either arm of the ternary.
            $from = $length > $upstream_length ? $end - $upstream_length : $start;
            $to   = $end + $downstream_length;
        }
        elsif ($strand eq "-") {
            # The gene starts at $start (its end coordinate in the GFF).
            $from = $length > $upstream_length ? $start + $upstream_length : $end;
            $to   = $start - $downstream_length;
        }
        else {
            next;
        }

        my $name = join "_", $id, $from, $to, $strand;
        print qq{sfetch -d bacteria.embl -F "fasta" -f $from -t $to -r "$name" .\n};
    }
' file.upstream.coord.txt | sh > file.upstream.fa