#!/usr/bin/perl -w
#
# Proof-of-concept for using minimal memory to search huge
# files, using a sliding window, matching within the window,
# and relying on /gc and pos() to restart the search at the
# correct spot whenever we slide the window.
#
# Doesn't correctly handle potential matches that overlap;
# the first fragment that matches wins.
#
use strict;
use warnings;
use constant BLOCKSIZE => 20;    ##(8 * 1024);

# Byte offsets of every match reported by search(), in ascending order.
my @findoffset;
my $file = "ascii-code.htm";

search(
    $file,    #"bighuge.log",
    sub {
        print $_[0], " at offset $_[1]\n";
        push @findoffset, $_[1];
    },
    # "]*>");
    "javasc"
);

# Re-read the file as lines and print every line on which a match starts.
# Skipped entirely when nothing matched, so we never index an empty list.
if (@findoffset) {
    my ($line, $offset, $idx) = (0, 0, 0);

    open(my $in, "<", $file) or die "$file: $!";
    binmode($in);    # count bytes exactly the way search() does

    LINE: while (my $text = <$in>) {
        $line++;
        my $len = length($text);
        $offset += $len;    # offset of the first byte AFTER this line

        # A match belongs to this line when it starts before the line's
        # end.  Loop so that several matches on one line are all reported
        # against this line instead of spilling onto the following ones.
        while ($offset > $findoffset[$idx]) {
            print "$line,$offset,$findoffset[$idx],$len:\t$text";
            $idx++;
            last LINE if $idx > $#findoffset;
        }
    }
    close($in);
}

#------------------------------------------
# search($file, $callback, @fragments)
#
# Scans $file for any of @fragments using a window of at most
# 2 * BLOCKSIZE bytes, so memory use is independent of file size.
#
#   $file      - path of the file to scan; it is read in binary mode.
#   $callback  - code ref invoked once per match as
#                $callback->($matched_text, $byte_offset), where
#                $byte_offset is the position of the first matched
#                byte, counted from the start of the file.
#   @fragments - pattern fragments; they are joined with "|" and
#                interpolated as a regex (NOT quoted), matched
#                case-insensitively with "." also matching newlines.
#
# Dies if the file cannot be opened or read.  Returns nothing.
#
# The regex is only ever applied to the current window, so a match
# must fit inside 2 * BLOCKSIZE bytes to be found.
#
sub search {
    my ($file, $callback, @fragments) = @_;

    open(my $fh, "<", $file) or die "$file: $!";
    binmode($fh);

    # Compile the pattern once per call.  (The /o flag would freeze the
    # first pattern ever used and ignore the fragments of later calls.)
    my $alternation = join("|", @fragments);
    my $re          = qr/($alternation)/is;

    # File offset of the first byte currently held in $window.
    my $window_start = 0;

    # prime the window with two blocks (if possible)
    my $nbytes = read($fh, my $window, 2 * BLOCKSIZE);
    die "$file: $!" unless defined $nbytes;

    while ($nbytes > 0) {

        # match as many times as we can within the window; /c keeps
        # pos() at the end of the final match when the search fails.
        while ($window =~ m/$re/gc) {
            my $matched = $1;    # copy: the callback may run its own regex
            $callback->($matched, $window_start + $-[0]);
        }
        my $pos = pos($window);    # undef if nothing has matched yet

        # grab the next block
        $nbytes = read($fh, my $block, BLOCKSIZE);
        die "$file: $!" unless defined $nbytes;
        last if $nbytes == 0;

        # slide the window by discarding the initial block and
        # appending the next.  Modifying $window resets pos(), so
        # restore it, shifted left by the amount we discarded.
        substr($window, 0, BLOCKSIZE) = '';
        $window .= $block;
        $window_start += BLOCKSIZE;

        $pos = defined $pos ? $pos - BLOCKSIZE : 0;
        pos($window) = $pos > 0 ? $pos : 0;
    }

    close($fh);
    return;
}