#!/usr/bin/perl -w
#
# Proof-of-concept for using minimal memory to search huge
# files, using a sliding window, matching within the window,
# and relying on /gc and pos() to restart the search at the
# correct spot whenever we slide the window.
#
# Doesn't correctly handle potential matches that overlap;
# the first fragment that matches wins.
#

use strict;
use constant BLOCKSIZE => 20; ##(8 * 1024);

# Byte offsets handed to the callback by search(), in the order found.
my @findoffset;
my $file =  "ascii-code.htm";
search( $file, #"bighuge.log",
        sub { print $_[0], " at offset $_[1]\n"; push @findoffset,$_[1]; },
       # "<img[^>]*>");
       "javasc");

# Re-read the file as lines and print, for every recorded offset, the line
# that contains it as "line number, end offset of line, match offset,
# line length: line text".  Nothing is re-read when there were no matches.
if (@findoffset) {
    my ($line, $offset, $idx) = (0, 0, 0);

    open(my $F, "<", $file) or die "$file: $!";
    binmode($F);    # search() reads in binmode, so count raw bytes here too

    LINE: while (my $text = <$F>) {
        $line++;
        my $len = length($text);
        $offset += $len;    # byte offset just past the end of this line

        # A line covers offsets [$offset - $len, $offset).  Several matches
        # may fall on the same line, so consume all of them before moving on.
        while ( $findoffset[$idx] < $offset ) {
            print "$line,$offset,$findoffset[$idx],$len:\t$text";
            $idx++;
            last LINE if $idx > $#findoffset;
        }
    }
    close ($F);
}

#------------------------------------------
# search($file, $callback, @fragments)
#
# Scan $file for any of the regex fragments in @fragments while keeping only
# a sliding window of roughly two blocks (2 * BLOCKSIZE bytes, plus at most
# one more block while sliding) in memory.
#
#   $file      - path of the file to scan.  It is read in binmode, so every
#                offset is a byte offset.
#   $callback  - code ref called once per match as
#                $callback->($matched_text, $offset), where $offset is the
#                0-based byte offset in the file at which the match starts.
#   @fragments - regex source strings.  They are joined with "|" and matched
#                case-insensitively with /s.  They are interpolated as regex
#                code, so do not pass untrusted input.  With no fragments the
#                sub returns immediately without opening the file.
#
# Dies if the file cannot be opened or if a read fails.
#
# Limitations: a match is only seen if it fits inside the current window, so
# matches longer than BLOCKSIZE bytes may be missed, and an open-ended
# pattern can be reported truncated at the window's edge.
sub search {
    my ($file, $callback, @fragments) = @_;

    # An empty alternation would match the empty string at every position.
    return unless @fragments;

    # Compile once per call.  (The /o flag would freeze the pattern of the
    # very first call for the lifetime of the program.)
    my $alternation = join("|", @fragments);
    my $re = qr/($alternation)/is;

    open(my $F, "<", $file) or die "$file: $!";
    binmode($F);

    # File offset of the first byte currently held in $window.
    my $window_start = 0;

    # prime the window with two blocks (if possible)
    my $nbytes = read($F, my $window, 2 * BLOCKSIZE);
    die "$file: $!" unless defined $nbytes;

    while ( $nbytes > 0 ) {

        # Match as many times as we can within the window.  /c keeps pos()
        # at the end of the last successful match when the loop finally
        # fails, so we know where to resume after sliding.
        while ( $window =~ m/$re/gc ) {
            # Copy before calling out: $1 and @- are globals that a regex
            # inside the callback would overwrite.
            my ($text, $at) = ($1, $window_start + $-[0]);
            $callback->($text, $at);
        }
        my $pos = pos($window) || 0;

        # grab the next block
        $nbytes = read($F, my $block, BLOCKSIZE);
        die "$file: $!" unless defined $nbytes;
        last if $nbytes == 0;

        # Slide the window: discard the leading block, append the new one,
        # and move the resume position back by the amount discarded.
        # Modifying the string resets pos(), so it is restored last.
        my $drop = length($window) < BLOCKSIZE ? length($window) : BLOCKSIZE;
        substr($window, 0, $drop) = '';
        $window_start += $drop;
        $window .= $block;
        $pos -= $drop;
        pos($window) = $pos > 0 ? $pos : 0;
    }

    close($F);
}