#### configuration ####

# NOTE(review): the three lists below are file-scoped lexicals (`my`), while
# clean_up_htmltree() reads the package variables @Conf::ignore_tags,
# @Conf::ignore_elements and @Conf::ignore_attr.  A `my` variable is never
# reachable under a package-qualified name, so these declarations do not feed
# the sub by themselves -- confirm where the Conf package gets populated.

# attributes to ignore
my @ignore_attr = qw(
    bgcolor background color face style link alink vlink text
    onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload
    onmousedown onmousemove onmouseout onmouseover onmouseup onreset
    onselect onunload
    class xmlns:w xmlns:o xmlns
);

# tags to ignore
my @ignore_tags = qw(font big small body dir html div span);

# tags to drop with content
my @ignore_elements = qw(script style head o:p);

############################################################
# clean_up_htmltree($input)
#
# Parses an HTML string with HTML::TreeBuilder, strips the
# markup configured in the Conf package and returns the
# re-serialised HTML.
#
#   $input  - HTML source as a string (undef is treated as '')
#
# Reads:
#   @Conf::ignore_tags      tags whose element is replaced by its content
#   @Conf::ignore_elements  tags removed together with their content
#   @Conf::ignore_attr      attributes removed from every element
#
# Returns the cleaned document as produced by as_HTML().
############################################################
sub clean_up_htmltree {
    my $input = shift;
    $input = '' unless defined $input;

    my $warn = 0;
    my $htmlex;    # set when 'html' is listed in @Conf::ignore_tags

    # `use` is resolved at compile time even though it sits inside the sub.
    use HTML::TreeBuilder;

    my $h = HTML::TreeBuilder->new;
    $h->ignore_unknown(0);    # keep unknown tags (e.g. o:p) in the tree
    $h->warn($warn);
    $h->parse($input);
    $h->eof;                  # tell the parser the document is complete

    # Drop all unwanted tags but keep what they contain.
    for my $tag (@Conf::ignore_tags) {
        if (lc($tag) eq "html") {
            # Not unwrapped here; only remembered for the
            # post-processing of the serialised output below.
            $htmlex = 1;
            next;
        }
        for my $element ($h->look_down('_tag', "$tag")) {
            # An element without a parent cannot be replaced by its content.
            next unless $element->parent;
            # replace_with_content returns the now empty, parentless element.
            $element->replace_with_content->delete;
        }
    }

    # Drop all unwanted elements (tags w/ content).
    for my $tag (@Conf::ignore_elements) {
        for my $element ($h->look_down('_tag', "$tag")) {
            # Skips the root and any node that was already destroyed together
            # with a matching ancestor earlier in this list.
            next unless $element->parent;
            $element->delete;
        }
    }

    # Drop all unwanted attributes; one tree walk per attribute name.
    for my $attr (@Conf::ignore_attr) {
        my @carriers = $h->look_down(sub { defined($_[0]->attr($attr)) });
        $_->attr($attr, undef) for @carriers;    # undef deletes the attribute
    }

    # Drop unwanted script code: empty every element that has a text child
    # of the form "<![ ... ]>".
    my @holders = $h->look_down(
        sub {
            return scalar grep { !ref($_) && /^<\s*!\[.+?\]\s*>$/ }
                $_[0]->content_list;
        }
    );
    $_->delete_content for @holders;

    # params = entities to encode, indent, optional endtags
    my $output = $h->as_HTML(undef, " ", {});
    $h = $h->delete();    # nuke it!

    if ($htmlex) {
        # NOTE(review): as written these two substitutions only trim
        # whitespace; they do not remove an <html> wrapper.  Confirm whether
        # the patterns were meant to contain the opening/closing html tag.
        $output =~ s:^\s*::m;
        $output =~ s:\s*$::m;
    }

    return $output;
}