=head1 NAME HTML::Scrubber - Perl extension for scrubbing/sanitizing html =head1 SYNOPSIS =for example begin #!/usr/bin/perl -w use HTML::Scrubber; use strict; # my $html = q[
# bold # underlined # # LINK # # # # # ]; # my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ] ] ); # print $scrubber->scrub($html); # $scrubber->deny( qw[ p b i u hr br ] ); # print $scrubber->scrub($html); # =for example end =head1 DESCRIPTION If you wanna "scrubbing"/"sanitize" html input, this be the modulehtml I wasn't satisfied with HTML::Sanitizer because it is based on HTML::TreeBuilder, so I thought I'd write something similar that works directly with HTML::Parser. =head1 How does it work? First a note on documentation: just study the example below. It's all the documentation you could need. If you're new to perl, good luck to you. When a tag is encountered, HTML::Scrubber allows/denies the tag using the explicit rule if one exists. If no explicit rule exists, Scrubber applies the default rule. If an explicit rule exists, but it's a simple rule(0 or 1), the default attribute rule is applied. =head2 EXAMPLE =for example begin #!/usr/bin/perl -w use HTML::Scrubber; use strict; # my @allow = qw[ br hr b a ]; # my @rules = ( script => 0, img => { src => qr{^(?!http://)}i, # only relative image links allowed alt => 1, '*' => 0, }, ); # my @default = ( 0 => # default rule, deny all tags { '*' => 1, # default rule, allow all attributes 'href' => qr{^(?!(?:java)?script)}i, 'src' => qr{^(?!(?:java)?script)}i, # If your perl doesn't have qr # just use a string with length greater than 1 'cite' => '(?i-xsm:^(?!(?:java)?script))', 'language' => 0, 'name' => 1, # could be sneaky, but hey ;) 'onblur' => 0, 'onchange' => 0, 'onclick' => 0, 'ondblclick' => 0, 'onerror' => 0, 'onfocus' => 0, 'onkeydown' => 0, 'onkeypress' => 0, 'onkeyup' => 0, 'onload' => 0, 'onmousedown' => 0, 'onmousemove' => 0, 'onmouseout' => 0, 'onmouseover' => 0, 'onmouseup' => 0, 'onreset' => 0, 'onselect' => 0, 'onsubmit' => 0, 'onunload' => 0, 'src' => 0, 'type' => 0, } ); # my $scrubber = HTML::Scrubber->new(); $scrubber->allow( @allow ); $scrubber->rules( @rules ); # key/value pairs $scrubber->default( @default ); $scrubber->comment(1); # 1 allow, 0 deny # ## preferred way to create the same object $scrubber = HTML::Scrubber->new( allow => \@allow, rules => \@rules, default => \@default, comment => 1, process => 0, ); # require Data::Dumper,die Data::Dumper::Dumper($scrubber) if @ARGV; # my $it = q[
IN ITALICS WITH FAKE="attribute"
IN BOLD
HREF=JAVASCRIPT <!>
ONMOUSEOVER JAVASCRIPT
]; # print "#original text",$/, $it, $/; print "#scrubbed text (default ", $scrubber->default(), # no arguments returns the current value " comment ", $scrubber->comment(), " process ", $scrubber->process(), " )", $/, $scrubber->scrub($it), $/; # $scrubber->default(1); # allow all tags by default $scrubber->comment(0); # deny comments # print "#scrubbed text (default ", $scrubber->default(), " comment ", $scrubber->comment(), " process ", $scrubber->process(), " )", $/, $scrubber->scrub($it), $/; # $scrubber->process(1); # allow process instructions (dangerous) $default[0] = 1; # allow all tags by default $default[1]->{'*'} = 0; # deny all attributes by default $scrubber->default(@default); # set the default again # print "#scrubbed text (default ", $scrubber->default(), " comment ", $scrubber->comment(), " process ", $scrubber->process(), " )", $/, $scrubber->scrub($it), $/; =for example end =cut package HTML::Scrubber; use HTML::Parser(); use HTML::Entities; use vars qw[ $VERSION ]; use strict; $VERSION = '0.01'; sub new { my $package = shift; my $p = HTML::Parser->new( api_version => 3, default_h => [\&_scrub, "self, event, tagname, attr, text"], ); my $self = { _p => $p, _rules => { '*' => 0, }, _comment => 0, _process => 0, _r => "", }; $p->{"\0_s"} = bless $self, $package; return $self unless @_; my %args = @_; for my $f( qw[ default allow deny rules process comment ] ) { next unless exists $args{$f}; if( ref $args{$f} ) { $self->$f( @{ $args{$f} } ) ; } else { $self->$f( $args{$f} ) ; } } return $self; } sub _validate { my($t, $r, $a) = @_; return "<$t>" unless %$a; my %f; for my $k( keys %$a ) { if( exists $r->{$k} ) { if( ref $r->{$k} || length($r->{$k}) > 1 ) { $f{$k} = $a->{$k} if $a->{$k} =~ m{$r->{$k}}; } elsif( $r->{$k} ) { $f{$k} = $a->{$k}; } } elsif( exists $r->{'*'} and $r->{'*'} ) { $f{$k} = $a->{$k}; } } return "<$t $r>" if $r = join ' ', map { qq[$_="] .encode_entities($f{$_}) .q["] } keys %f; return "<$t>"; } sub _scrub { my( $p, $e, $t, $a, $text ) = @_; my $s = $p->{"\0_s"} ; if ( $e eq 'start' ) { if( exists $s->{_rules}->{$t} ) { # is there a specific rule if( ref $s->{_rules}->{$t} ) { # is it complicated?(not simple;) $s->{_r} .= _validate($t, $s->{_rules}->{$t}, $a); } elsif( $s->{_rules}->{$t} ) { # validate using default attribute rule $s->{_r} .= _validate($t, $s->{_rules}->{'_'}, $a); } } elsif( $s->{_rules}->{'*'} ) { # default allow tags $s->{_r} .= _validate($t, $s->{_rules}->{'_'}, $a); } } elsif ( $e eq 'end' ) { if( exists $s->{_rules}->{$t} ) { $s->{_r} .= "" if $s->{_rules}->{$t}; } elsif( $s->{_rules}->{'*'} ) { $s->{_r} .= ""; } } elsif ( $e eq 'comment' ) { $s->{_r} .= $text if $s->{_comment}; } elsif ( $e eq 'process' ) { $s->{_r} .= $text if $s->{_process}; } elsif ( $e eq 'text' or $e eq 'default') { $s->{_r} .= $text; } elsif ( $e eq 'start_document' ) { $s->{_r} = ""; } } sub comment { return $_[0]->{_comment} if @_ == 1; $_[0]->{_comment} = $_[1]; return(); } sub process { return $_[0]->{_process} if @_ == 1; $_[0]->{_process} = $_[1]; return(); } sub allow { my $self = shift; $self->{_rules}{$_}=1 for @_; # $self->{_p}->report_tags(@_); # I can't do this one ;( return(); } sub deny { my $self = shift; $self->{_rules}{$_} = 0 for @_; # $self->{_p}->ignore_tags(@_); # optimization ;) return(); } sub rules{ my $self = shift; my %rules = @_; for my $k(keys %rules) { $self->{_rules}{$k} = $rules{$k}; } return(); } sub default { return $_[0]->{_rules}{'*'} if @_ == 1; $_[0]->{_rules}{'*'} = $_[1] if defined $_[1]; $_[0]->{_rules}{'_'} = $_[2] if defined $_[2]; return(); } sub scrub_file { $_[0]->{_p}->parse_file($_[1]); return delete $_[0]->{_r}; } sub scrub { $_[0]->{_p}->parse($_[1]); $_[0]->{_p}->eof(); return delete $_[0]->{_r}; } 1; #print sprintf q[ '%-12s => %s,], "$_'", $h{$_} for sort keys %h;# perl! =head2 FUN If you don't know how to install this, try use ExtUtils::MakeMaker; WriteMakefile( 'NAME' => 'HTML::Scrubber', 'VERSION_FROM' => 'Scrubber.pm', # finds $VERSION ); If you have Test::Inline, try pod2test Scrubber.pm >scrubber.t perl scrubber.t =head1 CAVEATS The interface is subject to change until I release this on CPAN. The allow/deny/default methods are likely to go away, so I can implement some optimizations (see L for details). =head1 TODO Top priority, allow for redirecting output to a file || filehandle instead of just returning a giant string. Currently, $p->ignore_tags( TAG, ... ) is used as an optimization. I need to figure out the semantics of using $p->report_tags( TAG, ... ) as an optimization option. See L if you're curious. I have no plans at all of using $p->ignore_elements( TAG, ... ) =head1 SEE ALSO L, L, L. =head1 AUTHOR D.H aka PodMaster =for in_the_future Please use http://rt.cpan.org/ to report bugs. http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber =head1 LICENSE Copyright (c) 2003 by D.H. aka PodMaster. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut