=head1 NAME
HTML::Scrubber - Perl extension for scrubbing/sanitizing html
=head1 SYNOPSIS
=for example begin
#!/usr/bin/perl -w
use HTML::Scrubber;
use strict;
+ #
my $html = q[
<HR>
+ #
<B> bold
+ #
<U> underlined
+ #
<I>
+ #
<A href=#> LINK </A>
+ #
</I>
+ #
</U>
+ #
</B>
+ #
</HR>
+ #
];
+ #
my $scrubber = HTML::Scrubber->new( allow => [ qw[ p b i u hr br ]
+ ] );
+ #
print $scrubber->scrub($html);
+ #
$scrubber->deny( qw[ p b i u hr br ] );
+ #
print $scrubber->scrub($html);
+ #
=for example end
=head1 DESCRIPTION
If you wanna "scrubbing"/"sanitize" html input, this be the modulehtml
I wasn't satisfied with HTML::Sanitizer because it is
based on HTML::TreeBuilder,
so I thought I'd write something similar
that works directly with HTML::Parser.
=head1 How does it work?
First a note on documentation: just study the example below.
It's all the documentation you could need.
If you're new to perl, good luck to you.
When a tag is encountered, HTML::Scrubber
allows/denies the tag using the explicit rule if one exists.
If no explicit rule exists, Scrubber applies the default rule.
If an explicit rule exists,
but it's a simple rule(0 or 1),
the default attribute rule is applied.
=head2 EXAMPLE
=for example begin
#!/usr/bin/perl -w
use HTML::Scrubber;
use strict;
+ #
my @allow = qw[ br hr b a ];
+ #
my @rules = (
script => 0,
img => {
src => qr{^(?!http://)}i, # only relative image links allo
+wed
alt => 1,
'*' => 0,
},
);
+ #
my @default = (
0 => # default rule, deny all tags
{
'*' => 1, # default rule, allow all attributes
'href' => qr{^(?!(?:java)?script)}i,
'src' => qr{^(?!(?:java)?script)}i,
# If your perl doesn't have qr
# just use a string with length greater than 1
'cite' => '(?i-xsm:^(?!(?:java)?script))',
'language' => 0,
'name' => 1, # could be sneaky, but hey ;)
'onblur' => 0,
'onchange' => 0,
'onclick' => 0,
'ondblclick' => 0,
'onerror' => 0,
'onfocus' => 0,
'onkeydown' => 0,
'onkeypress' => 0,
'onkeyup' => 0,
'onload' => 0,
'onmousedown' => 0,
'onmousemove' => 0,
'onmouseout' => 0,
'onmouseover' => 0,
'onmouseup' => 0,
'onreset' => 0,
'onselect' => 0,
'onsubmit' => 0,
'onunload' => 0,
'src' => 0,
'type' => 0,
}
);
+ #
my $scrubber = HTML::Scrubber->new();
$scrubber->allow( @allow );
$scrubber->rules( @rules ); # key/value pairs
$scrubber->default( @default );
$scrubber->comment(1); # 1 allow, 0 deny
+ #
## preferred way to create the same object
$scrubber = HTML::Scrubber->new(
allow => \@allow,
rules => \@rules,
default => \@default,
comment => 1,
process => 0,
);
+ #
require Data::Dumper,die Data::Dumper::Dumper($scrubber) if @ARGV;
+ #
my $it = q[
<?php echo(" EVIL EVIL EVIL "); ?> <!-- asdf -->
<hr>
<I FAKE="attribute" > IN ITALICS WITH FAKE="attribute" </I><br
+>
<B> IN BOLD </B><br>
<A NAME="evil">
<A HREF="javascript:alert('die die die');"> HREF=JAVASCRIP
+T <!> </A>
<br>
<A HREF="image/bigone.jpg" ONMOUSEOVER="alert('die die die
+');">
<IMG SRC="image/smallone.jpg" ALT="ONMOUSEOVER JAVASCR
+IPT">
</A>
</A> <br>
];
+ #
print "#original text",$/, $it, $/;
print
"#scrubbed text (default ",
$scrubber->default(), # no arguments returns the current value
" comment ",
$scrubber->comment(),
" process ",
$scrubber->process(),
" )",
$/,
$scrubber->scrub($it),
$/;
+ #
$scrubber->default(1); # allow all tags by default
$scrubber->comment(0); # deny comments
+ #
print
"#scrubbed text (default ",
$scrubber->default(),
" comment ",
$scrubber->comment(),
" process ",
$scrubber->process(),
" )",
$/,
$scrubber->scrub($it),
$/;
+ #
$scrubber->process(1); # allow process instructions (danger
+ous)
$default[0] = 1; # allow all tags by default
$default[1]->{'*'} = 0; # deny all attributes by default
$scrubber->default(@default); # set the default again
+ #
print
"#scrubbed text (default ",
$scrubber->default(),
" comment ",
$scrubber->comment(),
" process ",
$scrubber->process(),
" )",
$/,
$scrubber->scrub($it),
$/;
=for example end
=cut
package HTML::Scrubber;
use HTML::Parser();
use HTML::Entities;
use vars qw[ $VERSION ];
use strict;
$VERSION = '0.01';
sub new {
my $package = shift;
my $p = HTML::Parser->new(
api_version => 3,
default_h => [\&_scrub, "self, event, tagname, attr, text"],
);
my $self = {
_p => $p,
_rules => {
'*' => 0,
},
_comment => 0,
_process => 0,
_r => "",
};
$p->{"\0_s"} = bless $self, $package;
return $self unless @_;
my %args = @_;
for my $f( qw[ default allow deny rules process comment ] ) {
next unless exists $args{$f};
if( ref $args{$f} ) {
$self->$f( @{ $args{$f} } ) ;
} else {
$self->$f( $args{$f} ) ;
}
}
return $self;
}
sub _validate {
my($t, $r, $a) = @_;
return "<$t>" unless %$a;
my %f;
for my $k( keys %$a ) {
if( exists $r->{$k} ) {
if( ref $r->{$k} || length($r->{$k}) > 1 ) {
$f{$k} = $a->{$k} if $a->{$k} =~ m{$r->{$k}};
} elsif( $r->{$k} ) {
$f{$k} = $a->{$k};
}
} elsif( exists $r->{'*'} and $r->{'*'} ) {
$f{$k} = $a->{$k};
}
}
return "<$t $r>"
if $r = join ' ',
map {
qq[$_="]
.encode_entities($f{$_})
.q["]
} keys %f;
return "<$t>";
}
sub _scrub {
my( $p, $e, $t, $a, $text ) = @_;
my $s = $p->{"\0_s"} ;
if ( $e eq 'start' ) {
if( exists $s->{_rules}->{$t} ) { # is there a specific rule
if( ref $s->{_rules}->{$t} ) { # is it complicated?(not si
+mple;)
$s->{_r} .= _validate($t, $s->{_rules}->{$t}, $a);
} elsif( $s->{_rules}->{$t} ) {
# validate using default attribute rule
$s->{_r} .= _validate($t, $s->{_rules}->{'_'}, $a);
}
} elsif( $s->{_rules}->{'*'} ) { # default allow tags
$s->{_r} .= _validate($t, $s->{_rules}->{'_'}, $a);
}
} elsif ( $e eq 'end' ) {
if( exists $s->{_rules}->{$t} ) {
$s->{_r} .= "</$t>" if $s->{_rules}->{$t};
} elsif( $s->{_rules}->{'*'} ) {
$s->{_r} .= "</$t>";
}
} elsif ( $e eq 'comment' ) {
$s->{_r} .= $text if $s->{_comment};
} elsif ( $e eq 'process' ) {
$s->{_r} .= $text if $s->{_process};
} elsif ( $e eq 'text' or $e eq 'default') {
$s->{_r} .= $text;
} elsif ( $e eq 'start_document' ) {
$s->{_r} = "";
}
}
sub comment {
return
$_[0]->{_comment}
if @_ == 1;
$_[0]->{_comment} = $_[1];
return();
}
sub process {
return
$_[0]->{_process}
if @_ == 1;
$_[0]->{_process} = $_[1];
return();
}
sub allow {
my $self = shift;
$self->{_rules}{$_}=1 for @_;
# $self->{_p}->report_tags(@_); # I can't do this one ;(
return();
}
sub deny {
my $self = shift;
$self->{_rules}{$_} = 0 for @_;
# $self->{_p}->ignore_tags(@_); # optimization ;)
return();
}
sub rules{
my $self = shift;
my %rules = @_;
for my $k(keys %rules) {
$self->{_rules}{$k} = $rules{$k};
}
return();
}
sub default {
return
$_[0]->{_rules}{'*'}
if @_ == 1;
$_[0]->{_rules}{'*'} = $_[1] if defined $_[1];
$_[0]->{_rules}{'_'} = $_[2] if defined $_[2];
return();
}
sub scrub_file {
$_[0]->{_p}->parse_file($_[1]);
return delete $_[0]->{_r};
}
sub scrub {
$_[0]->{_p}->parse($_[1]);
$_[0]->{_p}->eof();
return delete $_[0]->{_r};
}
1;
#print sprintf q[ '%-12s => %s,], "$_'", $h{$_} for sort keys %h;# per
+l!
=head2 FUN
If you don't know how to install this, try
use ExtUtils::MakeMaker;
WriteMakefile(
'NAME' => 'HTML::Scrubber',
'VERSION_FROM' => 'Scrubber.pm', # finds $VERSION
);
If you have Test::Inline, try
pod2test Scrubber.pm >scrubber.t
perl scrubber.t
=head1 CAVEATS
The interface is subject to change until I release this on CPAN.
The allow/deny/default methods are likely to go away,
so I can implement some optimizations (see L<TODO|"TODO"> for details)
+.
=head1 TODO
Top priority, allow for redirecting output to a
file || filehandle instead of
just returning a giant string.
Currently,
$p->ignore_tags( TAG, ... )
is used as an optimization.
I need to figure out the semantics of using
$p->report_tags( TAG, ... )
as an optimization option.
See L<HTML::Parser|HTML::Parser> if you're curious.
I have no plans at all of using
$p->ignore_elements( TAG, ... )
=head1 SEE ALSO
L<HTML::Parser>, L<Test::Inline>, L<HTML::Sanitizer>.
=head1 AUTHOR
D.H aka PodMaster
=for in_the_future
Please use http://rt.cpan.org/ to report bugs.
http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber
=head1 LICENSE
Copyright (c) 2003 by D.H. aka PodMaster.
All rights reserved.
This module is free software;
you can redistribute it and/or modify it under
the same terms as Perl itself.
=cut
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.