After discussing how to convert Word's HTML horror into valid, clean HTML in the node "Converting Word97 (or later) exported HTML to valid HTML" and in the CB, I decided to pursue a solution built on the excellent HTML::TreeBuilder module.
Its advantage over tidy and similar tools is that I can configure which tags, elements and attributes to drop. I also noticed that tidy gave up on some unusual Word files while HTML::TreeBuilder did not. The drawback, of course, is speed.
For completeness, here are some pointers to alternative solutions suggested by fellow monks (thanks to all):
alex pleiner <alex@zeitform.de>
zeitform Internet Dienste
#### configuration ####
# Three lists drive the clean-up performed by clean_up_htmltree():
#   ignore_attr     - attribute names removed from every element
#   ignore_tags     - tags that are removed while their content is kept
#   ignore_elements - tags that are removed together with their content
#
# clean_up_htmltree() reads these lists as @Conf::ignore_attr,
# @Conf::ignore_tags and @Conf::ignore_elements, so besides the file
# lexicals they are published under package Conf at the end of this
# section. Without that step the cleaner would only ever see empty lists.

# attributes to ignore
my @ignore_attr = qw(
    bgcolor background color face style link alink
    vlink text onblur onchange onclick ondblclick
    onfocus onkeydown onkeyup onload onmousedown
    onmousemove onmouseout onmouseover onmouseup
    onreset onselect onunload class xmlns:w xmlns:o
    xmlns
);

# tags to ignore
my @ignore_tags = qw(font big small body dir html div span);

# tags to drop with content
my @ignore_elements = qw(script style head o:p);

# Publish the defaults under the names the cleaner looks up. A list that
# is already populated in package Conf (for example by a separate
# configuration module) is left untouched.
@Conf::ignore_attr     = @ignore_attr     unless @Conf::ignore_attr;
@Conf::ignore_tags     = @ignore_tags     unless @Conf::ignore_tags;
@Conf::ignore_elements = @ignore_elements unless @Conf::ignore_elements;
############################################################
sub clean_up_htmltree {
############################################################
    # Parse an HTML string with HTML::TreeBuilder, strip everything the
    # configuration marks as unwanted, and return the cleaned HTML.
    #
    # Argument:
    #   $input - HTML source as a string (undef is treated as empty).
    #
    # Configuration read from package Conf:
    #   @Conf::ignore_tags     - tags removed while their content is kept;
    #                            "html" is special-cased, see below
    #   @Conf::ignore_elements - tags removed together with their content
    #   @Conf::ignore_attr     - attribute names removed from all elements
    #
    # Returns the serialized tree as a string. When "html" is listed in
    # @Conf::ignore_tags the surrounding <html>...</html> is cut from the
    # serialized text.
    my $input = shift;
    $input = '' unless defined $input;

    my $warn = 0;
    my $strip_html_wrapper;

    use HTML::TreeBuilder;
    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(0);    # keep unknown tags (e.g. <o:p>) in the tree
    $tree->warn($warn);
    $tree->parse($input);
    $tree->eof;                  # tell the parser the input is complete

    # drop all unwanted tags, keeping their content
    foreach my $name (@Conf::ignore_tags) {
        my $tag = lc $name;      # the original already compared "html" case-insensitively

        # <html> is the root of the tree and cannot be replaced by its
        # content; remember to cut it from the serialized output instead.
        if ($tag eq "html") {
            $strip_html_wrapper = 1;
            next;
        }

        # List-context look_down returns every match at once, which
        # avoids rescanning the whole tree after each replacement.
        foreach my $element ($tree->look_down('_tag', $tag)) {
            $element->replace_with_content;
        }
    }

    # drop all unwanted elements (tags w/ content)
    foreach my $name (@Conf::ignore_elements) {
        foreach my $element ($tree->look_down('_tag', lc $name)) {
            $element->detach;
        }
    }

    # drop all unwanted attributes
    foreach my $attr (@Conf::ignore_attr) {
        my @carriers = $tree->look_down(sub { defined($_[0]->attr($attr)) });
        foreach my $element (@carriers) {
            $element->attr($attr, undef);    # an undef value deletes the attribute
        }
    }

    # drop unwanted script code <![....]>
    # Only the text children that consist of such a marker are removed.
    # Detaching the parent's whole content would also throw away the
    # ordinary text and elements that sit next to the marker.
    my $marker_re = qr/^<\s*!\[.+?\]\s*>$/;
    my @holders   = $tree->look_down(
        sub { grep { !ref($_) && $_ =~ $marker_re } $_[0]->content_list }
    );
    foreach my $holder (@holders) {
        my @kept = grep { ref($_) || $_ !~ $marker_re } $holder->detach_content;
        $holder->push_content(@kept);
    }

    # as_HTML params = entities to encode, indent, optional endtags
    my $output = $tree->as_HTML(undef, " ", {});
    $tree = $tree->delete();    # nuke it!

    if ($strip_html_wrapper) {
        $output =~ s:^\s*<html>::m;
        $output =~ s:</html>\s*$::m;
    }
    return $output;
}