http://qs321.pair.com?node_id=388253
Category: HTML Utility
Author/Contact Info Zbigniew Lukasiak http://zby.aster.net.pl
Description: Compares two versions of a HTML page. The output is a list of "What is New" on the second one and is constructed so that can be safely included on a web page, but still contains some basic formatting, link and image relative addresses are fixed. The output contains as well some context information usefull for filtering changes (for example to reject changes that contain only the publication date or random quotes).

It's all very heuristic and there is no guarantee it would work for all sites, but I hope with some tuning it can be usefull for most. For me it worked much better than HTML::Diff. It does the parsing totally separately from the diffing, thus they both can be independently fine tuned. For parsing it uses HTML::PullParser so it shoudl be more reliable than the other diffs that use only regexp for the parsing.

It is a work in progress. I use it as a basis for my Active Bookmarks change aggregator.


{
    package Changes;
    use Data::Dumper;
    use List::Util qw/max min/;

    sub new{
        my ( $class, $string, $MAXCONTEXT ) = @_;
        return bless({string => $string, list => [], MAXCONTEXT => $MA
+XCONTEXT}, $class);
    }

    sub addchange{
        my($self, $change, $type) = @_;
        $change->{change} = substr($self->{string}, $change->{start}, 
+$change->{len}) if not defined $change->{change};
        my @list = @{$self->{list}};
        if(@list){
            my $latest = $list[$#list];
            my $offset = $change->{start} - ($latest->{start} + $lates
+t->{len});
#            print "start: $change->{start} offset: $offset\n";
            my $offstring = substr($self->{string}, $latest->{start} +
+ $latest->{len}, $offset);
            if($offset == 0 or $type ){
                $latest->{len} += $change->{len} + $offset;
                $latest->{change} .= $offstring . $change->{change};
            }else{
                push @{$self->{list}}, $change if $change->{change};
            }
        }else{
            push @{$self->{list}}, $change if $change->{change};
        }
        @list = @{$self->{list}};
        $latest = $list[$#list];
        $latest->{rawchange} = substr($self->{string}, $latest->{start
+}, $latest->{len});
#        $latest->{change} ||= $latest->{rawchange};
        my $prevend;
        if($#list > 0){
            my $prev = $list[$#list - 1];
            my $aftlen = min($self->{MAXCONTEXT}, $latest->{start} - (
+$prev->{start} + $prev->{len}));
            $prev->{after} = substr($self->{string}, $prev->{start} + 
+$prev->{len}, $aftlen); 
            $prevend = $prev->{start} + $prev->{len};
        }else{
            $prevend = 0;
        }
        my $befstart = max($latest->{start} - $self->{MAXCONTEXT}, $pr
+evend);
        my $beflen = min($self->{MAXCONTEXT}, $latest->{start} - $prev
+end);
#        print "$#list prevend: $prevend, beflen: $beflen, befstart: $
+befstart\n";
        $latest->{before} = substr($self->{string}, $befstart, $beflen
+);
        $latest->{after} = substr($self->{string}, $latest->{start} + 
+$latest->{len}, $self->{MAXCONTEXT});
    }
    sub changelist{
        my($self) = @_;
        return $self->{list};
    }
}

{

package MyDiff;

use Algorithm::Diff qw/sdiff/;
use List::Util qw/min max/;
use Data::Dumper;
use strict;

my $lineendingr = qr/(?<=<br>)(?!\n)|(?<=<br\/>)(?!\n)|(?<=<br \/>)(?!
+\n)|(?<=<li>)(?!\n)|(?<=<p>)(?!\n)|(?<=<\/tr>)(?!\n)|(?<=\n)/;

#my $lineendingr = qr/(?<=<br>)(?!\n)|(?<=\n)/;

sub new {
    my ( $class, $first, $second ) = @_;
    my @list1 = split /$lineendingr/, $first;
    my @list2 = split /$lineendingr/, $second;
    my $offset  = 0;
    my @offsets = map { $offset += length($_) } @list2;
    unshift @offsets, 0;
#    print Dumper(\@offsets);
    my @dlist   = sdiff( \@list1, \@list2 );
    my @additions;
    my ( $rel, $lastdiff );

    my $pos = 0;
    for(my $i = 0; $i < scalar @dlist; $i++) {
        my $diff = $dlist[$i];
        my $indicator = $diff->[0];
        if ( $indicator eq '+' or $indicator eq 'c') {
            my $line = '';
            my $start = 0;
            my $end = length($diff->[2]);
            if($indicator eq 'c'){
                if($i == 0 or $dlist[$i - 1][0] !~ /\+|c/){
                    $start = trimleft( $diff->[1], $diff->[2] );
                }
                if($i == $#dlist or $dlist[$i + 1][0] !~ /\+|c/){
#                    print "i $i, $#dlist, $dlist[$i + 1][0]\n";
                    $end = trimright( $diff->[1], $diff->[2] );
                }
                $line = substr( $diff->[2], $start, $end - $start) if(
+$start < $end);
            }else{
                $line = $diff->[2];
            }
            push @additions,
              {
                start  => $offsets[ $pos ] + $start,
                string => $line,
                len    => length( $line ),
              } if length($line);
        }
        $pos++ if $indicator ne '-';
    }
#    print Dumper(\@additions);
    return bless( { list => \@additions }, $class );
}

sub trimleft {
    my ( $t1, $t2 ) = @_;
    my $minlen = min( length($t1), length($t2) );
    my ( $start, $end );
    for ( $start = 0 ; $start <= $minlen ; $start++ ) {
        if ( substr( $t1, $start, 1 ) ne substr( $t2, $start, 1 ) ) {
            last;
        }
    }
#    print "trimleft $t1, $t1, $start\n";
    return ( $start);
}

sub trimright {
    my ( $t1, $t2 ) = @_;
    my $minlen = min( length($t1), length($t2) );
    my $lenlef = length($t1);
    my $lenrig = length($t2);
    my $endi;
    for ( $endi = 1 ; $endi <= $minlen ; $endi++ ) {
        if (substr( $t1, $lenlef - $endi, 1 ) ne substr( $t2, $lenrig 
+- $endi, 1 ))
        {
            last;
        }
    }
    my $end = $lenrig - $endi + 1;
    return ( $end );
}


sub additionsinrange{
    my($self, $start, $len) = @_;
    my @result;
    for my $addition (@{$self->{list}}){
        my $pocz = max($addition->{start}, $start);
        my $kon  = min($addition->{start} + $addition->{len}, $start +
+ $len);
        if($addition->{start} > $start + $len){
            last;
        }
#            print '==', $addition->{string}, "pocz: $pocz, kon: $kon,
+ start: $start, len: $len, addition->start: $addition->{start}, addit
+ion->len $addition->{len}\n";
        if($kon > $pocz){
            push @result, {
                string => substr(
                    $addition->{string}, 
                    $pocz - $addition->{start},
                    $kon - $pocz),
                start => $pocz,
                len   => $kon - $pocz};
        }
    }
    return @result;
}

}

package HTMLDiff;
use HTML::PullParser;
use URI::URL;
use Data::Dumper;

use strict;

my %allowed = (
    't' => 1,
    'a' => 1,
    'p' => 1,
    'b' => 1,
    'br' => 1,
    'em' => 1,
    'strong' => 1,
    'img' => 1,
);

my %tagstocomplete = (
    'a' => 1,
    'b' => 1,
    'em' => 1,
    'strong' => 1,
);

# a list of tags that when are started in the change then 
# the change is extended to the pairing end tag

sub filterhtml{
    my ($text1, $text2, $base) = @_;
    my $diff = MyDiff->new($text1, $text2);
    my $p = HTML::PullParser->new(
        doc => $text2,
        start => '"S", tagname, offset, length, text, attr',
        text => '"T", "t", offset, length, text',
        end   => '"E", tagname, offset, length, text',
    );
    my $output = Changes->new($text2, 300);
    my %intag;
    while ( my $token = $p->get_token ) {
        my($type, $tagname, $offset, $length, $text2, $attr) = @$token
+;
        my @additions = $diff->additionsinrange($offset, $length);
        if($intag{$tagname} and $type eq 'E'){
            $intag{$tagname}--;
            $output->addchange({ start => $offset, len => $length}, $t
+ype);
        }elsif(@additions){
            if($tagname eq 't'){
                for my $addition (@additions){
                    $output->addchange($addition);
                }
            }elsif($tagname eq 'a' or $tagname eq 'img'){
                my($change, $addrspec);
                if($tagname eq 'a'){
                    $addrspec = 'href';
                }else{
                    $addrspec = 'src';
                }
                my $url = URI::URL->new($attr->{$addrspec}, $base)->ab
+s;
                $change = "<$tagname $addrspec=\"$url\">";
                $output->addchange({ change => $change, start => $offs
+et, len => $length});
            }else{
                if($allowed{$tagname}){ 
                    $output->addchange({ start => $offset, len => $len
+gth});
                }elsif($tagname eq 'li'){
                    $output->addchange({ start => $offset, len => $len
+gth, change => '<br>'});
                }else{
                    $output->addchange({ start => $offset, len => $len
+gth, change => ''});
                }

            }
            if(($tagstocomplete{$tagname}) and $type eq 'S'){
                $intag{$tagname}++;
            }
        }
    }
    return @{$output->changelist};
}


=head1 NAME

HTMLDiff - 

=head1 SYNOPSIS

  use HTMLDiff;
  my @changes = HTMLDiff::filterhtml($text1, $text2, $uri);

=head1 DESCRIPTION

The filterhtml subroutine compares two versions of an HTML page and re
+turns 
'What is New' in the second version - this does not include 
'What was Deleted'.  The "change" field of the output is filtered
to contain valid HTML with only allowed tags (specified by 
the %HTMLDiff::allowedtags hash) and with relative addreses of links 
and images rewriten.  The third parameter - $uri is the address used 
to fix the relative addreses.

The output is a list of hashes with following fields: 
'rawchange' - the HTML text of the change, 
'change' - the change filtered for display 
'before' and 'after' - HTML context of the change.

=head1 MOTIVATION

I use it for aggregating changes made on sites that I read frequently.
+  
I based my Active Bookmarks web application on it.

=head1 LIMITATIONS

The algorithm used is very heuristic, it was tuned only for some pages
+ 
that I happened to be interested in.

=head1 AUTHOR

    Zbigniew Lukasiak
    http://zby.aster.net.pl

=head1 COPYRIGHT

This program is free software licensed under the...

    The General Public License (GPL)
    Version 2, June 1991


=head1 SEE ALSO

perl(1).

=cut

############################################# main pod documentation e
+nd ##


1; #this line is important and will help the module return a true valu
+e
__END