Re: Counter - number of tags per interval

The fastest way to do this is actually counterintuitive. You're looking to see the number of sequence tags in a given genomic interval. You can do something along the lines of what you've chosen to do, or you can try a different method. Read the sequence file in and add the tag counts to fixed-size genomic intervals. Then, read in the bed file and retrieve the bins (and their counts) that are in the same region. The only limitation on this method here is on the precision, as the bins are of fixed width. I'm attaching code meant for plotting heatmaps, but the idea is similar enough that I'll include code that may help with another downstream task simultaneously. It is older code, and the subroutine approach is a little silly perhaps, but the code is readable :)

#!/usr/bin/perl

use Getopt::Long;
use warnings;
use strict;

# Retrieves the normalized values in each bin in a window
# around some peak region. The output is designed to be used
# to plot a heatmap in R.

# init default variables
my $bin_size = 50;
my $scaling_factor = 10000000;
my $number_of_bins = 80;
my $fragment_length = 200;
my $bin_count = 0;
my %hash = ();
my %peaks = ();
my %ids = ();

sub main {
    # get the command line input
    my ($infile, $peaks, $outfile) = @ARGV;
    my $options = GetOptions("bin_size=i"     => \$bin_size,
                             "scal_factor=i"  => \$scaling_factor,
                             "bin_num=i"     => \$number_of_bins);
    
    $bin_count = $fragment_length / $bin_size;    
    
    # exceptions
    if (!defined $infile or !defined $peaks or !defined $outfile) {
        print STDERR "Usage: perl binForHeatmap.pl <infile> <peaks> <o
+utfile> \n";
        print STDERR "--bin_size <int> --scal_factor <int> --bin_num <
+int> \n";
        print STDERR "This script produces output that can be plotted 
+in R.\n";
        print STDERR "The input or output filename is missing!\n" and 
+exit;
    }
    
    # bin the reads and output
    print STDERR "Generating genomic bins of size $bin_size...\n";
    binReads($infile, \%hash);
    print STDERR "Outputing $number_of_bins bins around the summit of 
+the sorted peaks...\n";
    getBinsForPeaks($peaks, \%hash, $outfile);
}

sub binReads {
    my ($file, $hashref) = @_;
    
    # open the file and parse
    # expecting a bowtie format input file!
    open(IN, "$file") || die "Cannot open $file: $!\n";
    
    my $total = 0;
    
    while ( <IN> ) {
        chomp;
        my ($id, $strand, $chr, $start, undef) = split("\t", $_);
        $total++;
        if ($strand eq '+') {
            $start -= 75;
            my $bin = $start - ($start % $bin_size);
            #$hashref->{$chr}->{$bin} += 1;
            for (my $i = 0; $i < $bin_count; $i++) {
                $hashref->{$chr}->{$bin+($bin_size*$i)} += 1; # we ass
+ume a fragment size of 200 bp,
            }
        }
        else {
            $start += 75;
            my $bin = $start - ($start % $bin_size);
            #$hashref->{$chr}->{$bin} += 1;
            for (my $i = 0; $i < $bin_count; $i++) {
                $hashref->{$chr}->{$bin-($bin_size*$i)} += 1;
            }
        }
        
        $ids{$id} = 1;
        
        # get nearest bin and add 1
        #my $bin = $start - ($start % $bin_size);
        #$hashref->{$chr}->{$bin} += 1;
    }
    
    # count the number of id keys
    #$total = keys %ids;
    #print "$total\n" and exit;
    
    close IN;
    
    # normalize the bins
    my $multiplier = $scaling_factor / $total;
    foreach my $chr (keys %$hashref) {
        foreach my $bin (keys %{$hashref->{$chr}}) {
            $hashref->{$chr}->{$bin} *= $multiplier;
        }
    }
}

sub getBinsForPeaks {
    my ($file, $hashref, $outfile) = @_;
    
    open(IN, "$file") || die "Cannot open $file: $!\n";
    open(OUT, ">$outfile") || die "Cannot open $outfile: $!\n";
    
    my $counter = 1;
    
    while ( <IN> ) {
        chomp;
        # looking for the .xls output of arem/macs!
        next if $_ =~ m/#/;
        next if $_ =~ m/start/;
        my ($chr, $start, $end, $length, $summit, $score) = split("\t"
+, $_);
        next if !defined $end;  # capture that annoying blank line! 
        
        # normalize the score by the length
        $score /= $length;
        $peaks{$counter}{"summit"} = ($start + $summit);
        $peaks{$counter}{"score"}  = $score;
        $peaks{$counter}{"chrom"}  = $chr;
        $counter++;
    }
    
    close IN;
    
    # sort on the score and output bins around the summit
    foreach my $key (sort {$peaks{$a}{"score"} <=> $peaks{$b}{"score"}
+} keys %peaks) {
        # this sorts backwards, small to large; R plots things backwar
+ds, so it plots correctly
        my $summit = $peaks{$key}{"summit"};
        my $chr    = $peaks{$key}{"chrom"};
        my $score  = $peaks{$key}{"score"};
        #print STDERR "$chr\t$summit\t$score\n" and sleep(2);
        
        my $bin = $summit - ($summit % $bin_size);
        my $left_bin = $bin - (($number_of_bins / 2) * $bin_size);
        
        #my %hash = %$hashref; # prevents annoying output to stdout wh
+en no bin exists
        
        for (my $i = 0; $i < $number_of_bins; $i++) {
            my $count = (exists $hashref->{$chr}->{($left_bin + ($i * 
+$bin_size))}) ? $hashref->{$chr}->{($left_bin + ($i * $bin_size))} : 
+0;
            #print STDERR "$count\n" and sleep(2);
            if ($i == ($number_of_bins - 1)) {
                print OUT "$count\n";
            }
            else {
                print OUT "$count\t";
            }
        }
    }
    
    close OUT;
}

# call main
main();
[download]

Bioinformatics

Comment on Re: Counter - number of tags per interval Download Code


Perl: the Markov chain saw
	PerlMonks