#! perl -slw
use strict;

## Report every overlap between the ranges in a "main" file and the ranges
## in an "annotation" file. Both files hold tab-separated records:
##
##   main:        tag <TAB> start <TAB> end [...]
##   annotation:  tag <TAB> start <TAB> end <TAB> message
##
## For every (annotation, main) pair that shares a tag and whose ranges
## overlap, one line "tag<TAB>message" is written to output.txt.
##
## NOTE(review): the skip logic below appears to assume that, within each
## tag, both files are ordered by ascending start position (and that main
## ranges do not nest inside one another) -- confirm against the real data.
##
## The -l switch makes print() append a newline automatically; printf() is
## not affected by it, which is why prompt() uses printf.

my %main;

## Get the filename for and open the main file
my $mfile = prompt( "Main file?" );
open my $main_fh, '<', $mfile or die "$mfile: $!";

## Build a HoA keyed by the tag (chromosome?)
## Don't split the records yet, to save ram.
while( my $line = <$main_fh> ) {
    chomp $line;
    $line =~ m[^([^\t]+)\t] or die "Bad record in $mfile @ $.";
    push @{ $main{ $1 } }, $line;
}
close $main_fh;

## Get the filename for and open the annotations file
my $afile = prompt( "Annotation file?" );
open my $annot_fh, '<', $afile or die "$afile: $!";

## Open the output file
open my $out_fh, '>', 'output.txt' or die $!;

## Record our start time
print scalar localtime;

## Variables to remember the last Annot tag we processed
## and how many Main records we skipped before we found an overlap
my( $first, $lastTag ) = ( 0, '' );

OUTER:
while( my $annot = <$annot_fh> ) {
    chomp $annot;

    ## Read and split the Annot record
    my( $atag, $astart, $aend, $mess ) = split "\t", $annot;

    ## If the tag changed, reset the start position for the Main records
    ## and remember the new tag
    if( $atag ne $lastTag ) {
        $first   = 0;
        $lastTag = $atag;
    }

    ## Skip completely if this Annot tag has no corresponding Main tag
    next unless exists $main{ $atag };

    my $records = $main{ $atag };

    ## For each Main record with this tag,
    ## but skipping those we rejected last time.
    ## The slice bounds are evaluated once, before the loop starts, so
    ## incrementing $first inside the loop only affects later Annot records.
    for my $md ( @{ $records }[ $first .. $#{ $records } ] ) {

        ## Split the Main record. We do it over and over, but it's not
        ## hugely expensive.
        ## Split when building the HoA if you have enough memory.
        my( $mtag, $mstart, $mend ) = split "\t", $md;

        ## Increment the skip count and go to the next Main record
        ## if the end of this Main record is less than
        ## the start of the Annot record
        if( $mend < $astart ) {
            ++$first;
            next;
        }

        ## Skip the rest of these Main records
        ## if the start of this Main record is greater than
        ## the end of this Annot record
        next OUTER if $mstart > $aend;

        ## We have an overlap, so output the info
        print $out_fh "$atag\t$mess";
    }
}

close $annot_fh;

## Buffered write errors only surface at close, so check it.
close $out_fh or die "output.txt: $!";

## Record our end time
print scalar localtime;

## prompt( $text )
## Print $text to STDOUT without a trailing newline (printf is not affected
## by -l), read one line from STDIN and return it with the line ending
## removed. Dies if STDIN is already at end-of-file.
sub prompt {
    printf "%s", $_[ 0 ];
    my $in = <STDIN>;
    die "No input received for prompt '$_[ 0 ]'\n" unless defined $in;
    chomp $in;
    return $in;
}

__END__
C:\test>664760-gen > 664760.main

C:\test>wc -l 664760.main
300000 664760.main

C:\test>664760-gen > 664760.annot

C:\test>wc -l 664760.annot
300000 664760.annot

C:\test>664760-b
Main file?664760.main
Annotation file?664760.annot
Tue Jan 29 06:36:34 2008
Tue Jan 29 06:42:12 2008

C:\test>wc -l output.txt
112805 output.txt