Cleaning up HTML

bart has asked for the wisdom of the Perl Monks concerning the following question:

Replies are listed 'Best First'.

Re: Cleaning up HTML
by moritz (Cardinal) on Dec 20, 2007 at 13:15 UTC

I don't know if it does all of what you want, but it does clean up HTML ;-)

by bart (Canon) on Dec 20, 2007 at 13:22 UTC

The problem with HTML Tidy is that it does too much. It cleans up a lot of stuff I don't want it to touch, occasionally resulting in pages that just look a lot different. The page is just different HTML!

I never even use it to update a plain HTML file. I only use it to have it tell me where the errors are, so I can clean it up manually.

And now, I just want to clean up snippets, not a complete page. A lot of them.

[reply]

Re: Cleaning up HTML
by CountZero (Bishop) on Dec 20, 2007 at 16:29 UTC

HTML::Tidy

HTML Tidy

CountZero

A program should be light and agile, its subroutines connected like a string of pearls. The spirit and intent of the program should be retained throughout. There should be neither too little or too much, neither needless loops nor useless variables, neither lack of structure nor overwhelming rigidity." - The Tao of Programming, 4.1 - Geoffrey James

[reply]

Re: Cleaning up HTML
by Jenda (Abbot) on Dec 20, 2007 at 18:54 UTC

sub DeMoronizeHTML {
    my $html = shift;
    return '' if $html eq '';

    for ($html) {
        # try to replace dashed paragraphs by <UL><LI>...
        s{<p style="[^"]*"><span style="[^"]*"><span><FONT size=\d+>·<
+/FONT><SPAN style="[^"]*">(?:&nbsp;)+\s*</SPAN></SPAN></SPAN>(<SPAN s
+tyle="[^"]*"><FONT size=\d+>.*?</FONT></SPAN>)</P>}{<LÍ>$1</LÍ>}gis;
        s{(?<!</LÍ>)<LÍ}{<ul><LÍ}gis;
        s{</LÍ>(?!<LÍ)}{</LÍ></ul>}gis;
        s{<(/?)LÍ>}{<$1LI>}g;

        # add <BR> after </UL>
        # s{</ul><p\b}{</ul><BR><p}gis;

        s{style="TEXT-ALIGN: center" align="center"}{align="center"}gi
+;

        #remove <p> from within <LI>
        s{<li>\s*<p>}{<li>}gi;
        s{</p>\s*</li>}{</li>}gi;
    }

    {
        my $root = HTML::TreeBuilder->new();
        $root->parse_content($html);
        $root = $root->guts();
        #$root->elementify();

        #$h->delete_ignorable_whitespace();

        # remove <span>s and <font>s with no attributes
        foreach my $tag ($root->look_down( '_tag' => qr'^(?:span|font)
+$', style => undef,
            sub {
                return ($_[0]->all_attr_names() <= 3);
            }
        )) {
            $tag->replace_with_content->delete()
        };

        #remove <span>s and <font>s with only whitespace content
        $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|
+b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|
+kbd|var)$',
            sub {
                my $tag = $_[0];
                while (1) {
                    foreach ($tag->content_list()) {
                        return 0
                            if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0
+ is a &nbsp;
                            or ref($_) and $_->tag() ne 'br';
                    }
                    my $parent = $tag->parent();
                    $tag->replace_with_content->delete();
                    $tag = $parent or return 0;
                    return 0 unless $parent->tag() =~ m'^(?:span|small
+|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq
+uote|center|cite|kbd|var)$';
                }
                return 0;
            }
        );

        $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|
+b|i|font|u|sub|sup|tt|code|em|big|cite|kbd|var)$',
            sub {
                my $tag = $_[0];
        #merge <span>s that are followed by a <span> with the same att
+ributes
RIGHT:            while (my $next = $tag->right()) {
                    last if !$next or !ref($next) or $next->tag() ne $
+tag->tag();
                    foreach my $attr ($tag->all_attr_names()) {
                        next if $attr =~ /^_/;
                        last RIGHT if $tag->attr($attr) ne $next->attr
+($attr)
                    };
                    $tag->push_content($next->detach_content());
                    $next->delete();
                }
        #remove <span>s that contain just a <span> with the same attri
+butes
                my $replaced = 0;
                while (1) {
CHILD:                foreach my $child ($tag->content_list()) {
                        next unless ref($child) and $child->tag() eq $
+tag->tag();
                        foreach my $attr ($tag->all_attr_names()) {
                            next if $attr =~ /^_/;
                            next CHILD if $tag->attr($attr) ne $child-
+>attr($attr);
                        };
                        $child->replace_with_content->delete();
                        $replaced++;
                    }
                    last unless $replaced;
                    $replaced = 0;
                }
                return 0;
            }
        );

        # merge <font ... ><font ...>...</font></font>
        {
        my $fun;
        $fun = sub {
                my $tag = $_[0];
                return 0 if $tag->content_list() != 1;
                my ($child) = $tag->content_list();
                return 0 unless ref($child) and $child->tag() eq 'font
+';

                foreach my $attr ($tag->all_attr_names()) {
                    next if $attr =~ /^_/;
                    if (! defined($child->attr($attr))) {
                        $child->attr($attr, $tag->attr($attr))
                    }
                };

                $tag->replace_with_content->delete();
                $child->look_down( '_tag' => 'font', $fun);
                return 0;
            };

        $root->look_down( '_tag' => 'font', $fun);
        }

        # replace <p>s with whitespace content by <BR/>
        foreach my $tag ($root->look_down( '_tag' => 'p',
                sub {
                    foreach ($_[0]->content_list()) {
                        return 0
                            if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0
+ is a &nbsp;
                            or ref($_);
                    }
                    return 1;
                }
            )) {
            my $style = $tag->attr("style");
            if ($style ne '' and $style =~ /\bMARGIN:\s*0\w+(?:\s+\d+\
+w+\s+0\w|;|$)/i and !$tag->content_list()) {
                $tag->delete();
            } else {
                $tag->replace_with(HTML::Element->new('br'))->delete()
+;
            }
        }

        #remove <span>s and <font>s with only whitespace content (agai
+n, after replacing <P>whitespace</p> by <BR>)
        $root->look_down( '_tag' => qr'^(?:span|small|s|strike|strong|
+b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockquote|center|cite|
+kbd|var)$',
            sub {
                my $tag = $_[0];
                while (1) {
                    foreach ($tag->content_list()) {
                        return 0
                            if !ref($_) and $_ !~ /^[\xA0\s]+$/ # \xA0
+ is a &nbsp;
                            or ref($_) and $_->tag() ne 'br';
                    }
                    my $parent = $tag->parent();
                    $tag->replace_with_content->delete();
                    $tag = $parent or return 0;
                    return 0 unless $parent->tag() =~ m'^(?:span|small
+|s|strike|strong|b|i|font|u|sub|sup|tt|code|em|pre|h\d|div|big|blockq
+uote|center|cite|kbd|var)$';
                }
                return 0;
            }
        );

        $html = $root->as_HTML(undef, undef, {});
        $root->delete();
    }

    for ($html) {
        s{^\s*<div>\s*}{}si;
        s{\s*</div>\s*$}{}si;
        s{<br\s*/?>\s+\n?}{<br/>\n}g;

        #remove excess <BR>s between paragraphs
        s{\n{3,}}{\n\n}gs;
        s{(?:<br/?>\n?){3,}}{<br/>\n<br/>\n}gs;

        #remove excess <BR>s between paragraphs
#            1 while s{<P>(?:\s+|<BR/?>)*<P\b}{<P}gi;
#            1 while s{</P>(?:\s+|<BR/?>)*</P>}{</P>}gi;

        #remove empty paragraphs at the end of the text
        s{(?:\s+|&nbsp;|<br/?>)+(</div>)?$}{$1}si;
    };

    return $html;
}

sub DeWordifyHTML {
    my $html = shift;
    return '' if $html eq '';

    for ($html) {
        s{<\?xml:namespace [^>]+>}{}g;                                
+            # remove <?xml:namespace ...>
        s{<\w[\w\d\-]*:\w[\w\d\-]*(?:\s+(?:[^">]+|"[^"]*")*)?>}{}g;   
+ # remove <o:p>
        s{</\w[\w\d\-]*:\w[\w\d\-]*>}{}g;                             
+           # remove </o:p>
        s{(<SPAN style='FONT: \d+pt)\s*'([^']+)''>}{$1 $2'>}g;        
+        # fix <span style='FONT: xpt 'FontName''>

        tr/‘’“”/''""/;                             # use ordinary quot
+es
        s/…/.../g;                            # use three dots instead
+ of the three-dot character
        s/(?:[•·]|&#61623;) ?/- /g;    # don't use fancy dots
    }

    my $result = ''; # just for sure
    $parser ||= HTML::Parser->new( api_version => 3,
        marked_sections => 1,
        boolean_attribute_value => undef,
    );
    # remove mso- styles and classes, tabstops from styles
    my $sub = sub {
        my ($tagname, $attr, $attrseq, $text) = @_;
        return '' if $tagname =~ /:/;
        my $leave = 1;
        if ($attr->{class} and $attr->{class} =~ /^mso/i) {
            $leave = 0;
            delete $attr->{class};
            @$attrseq = grep $_ ne 'class', @$attrseq;
        }
        if ($attr->{style}) {
            my @style = split /\s*;\s*/, $attr->{style};
            my $count = scalar(@style);
            @style = grep !/^(mso-.*|tab-stops.*|MARGIN-TOP: 0in)$/i, 
+@style;
            if ($count > scalar(@style)) { # we removed some
                if (@style) {
                    $attr->{style} = join '; ', @style;
                } else {
                    delete $attr->{style};
                    @$attrseq = grep $_ ne 'style', @$attrseq;
                }
                $leave = 0;
            }
        }
        if ($leave) {
            $result .= $text
        } else {
            $result .= _html_tag($tagname, $attr, $attrseq);
        }
    };
    $parser->handler(start =>  $sub, "tagname, attr, attrseq, text");
    $parser->handler(default => sub {$result .= $_[0]}, "text");

    $parser->parse($html)->eof;
    return $result
}
[download]

I guess you can tell I have to work with a fairly (censored) "HTML" at times.

Jenda
Support Denmark!
Defend the free world!

[reply]
[d/l]

Re^2: Cleaning up HTML

by Anonymous Monk on Feb 14, 2016 at 18:15 UTC

 $result .= _html_tag($tagname, $attr, $attrseq);

 Undefined subroutine &main::_html_tag called at ...

 #!/usr/bin/perl
 use HTML::Parser ();
        open(my $html, "<", $ARGV[0])...

Do you have an update for this? I realized it's 9 year old code, but I thought I'd ask.

Thanks, Art

[reply]

Re^3: Cleaning up HTML

by Jenda (Abbot) on Mar 02, 2016 at 15:50 UTC

It's incomplete. Sorry.

    sub _html_tag {
        my ( $tag, $attr, $attrseq) = @_;
        my $html;
        $html = "<$tag";
        if ($attrseq and ref($attrseq) eq 'ARRAY') {
            foreach my $key (@$attrseq) {
                if (defined $attr->{$key}) {
                    $html .= " $key="._arg_escape($attr->{$key});
                } else {
                    $html .= ' '.$key;
                }
            }
        } elsif ($attr and ref($attr)) {
            foreach my $key (keys %$attr) {
                if (defined $attr->{$key}) {
                    $html .= " $key="._arg_escape($attr->{$key});
                } else {
                    $html .= ' '.$key;
                }
            }
        }
        $html .= ">";
        return $html;
    }

    sub _arg_escape {
        my $arg = shift;
        return qq{"$arg"} if ($arg !~ /"/);
        return qq{'$arg'} if ($arg !~ /'/);
        $arg =~ s/"/&dblquote;/g;
        return qq{"$arg"};
    }
[download]

Not sure it's complete like this. I never got around to releasing this as a module. If it's not complete, either download the Jenda.Rex zip from http://jenda.krynicky.cz/#Jenda.Rex, extract the .pm and dissect that (remove all references to Win32::OLE (needed only when the module is wrapped as a COM DLL for use in VB(script)) and Win32::Registry (only used to find out the code page used by the system), remove the whole package JendaRex::CSVParser, ...) or send me a message with your email and I'll send you the module.

Jenda
Enoch was right!
Enjoy the last years of Rome.

[reply]
[d/l]

Re: Cleaning up HTML
by GrandFather (Saint) on Dec 21, 2007 at 02:39 UTC

The following is not smaller, but it does handle all the items on the wish list and should be easier to expand than your current code. Because it's using HTML::TreeBuilder and there doesn't seem to be an option to render HTML without a header and body I've stripped that stuff off using a regex, but the result is sorta ugly looking.

use strict;
use warnings;
use HTML::TreeBuilder;

my $inline = qr/^(b|i|s|del|font)$/;
my $block  = qr/^(p|table)$/;
my $html   = <<'HTML';
<FONT color="#0000ff" face="Verdana" size="1">
<p>paragraph</p>
</font>
<P align="center"><a href="#"><br/>
<font color="#0000ff" face="Verdana" size="1">&euro; 750aa</font>
<B><i>foo</b></i>
<font face="Verdana" size="1"><b><i></i></b></font>
<br /></a>
</p>
HTML

print cleanupHtml($html);

sub cleanupHtml {
    my $root = HTML::TreeBuilder->new;

    $root->parse_content(shift);
    $root->elementify();
    $root = cleanupElt($root);

    my $str = $root->as_HTML( undef, '   ', {} );
    $str =~ s!^.*?<body>(.*)</body>.*!$1!s;
    return $str;
}

sub cleanupElt {
    my $elt = shift;

    return unless ref $elt;
    expelBr($elt) if $elt->{_tag} eq 'a';

    my @elts = $elt->content_list();

    if (   $elt->{_tag} =~ $inline
        && @elts == 1
        && ref $elts[0]
        && $elts[0]->{_tag} =~ $block )
    {    # Invert order of inline and block elements
        my @nested = $elts[0]->detach_content();

        $elt->replace_with( $elts[0] );
        $elts[0]->push_content($elt);
        $elt->push_content(@nested);
        $elt  = $elts[0];
        @elts = $elt->content_list();
    }

    $_->replace_with_content()->delete() for grep { removeElt($_) } @e
+lts;
    return $elt if exists $elt->{_implicit};
    return undef if !exists $elt->{_content} || !@{ $elt->{_content} }
+;
    return $elt;
}

sub expelBr {
    my $elt = shift;

    return unless exists $elt->{_content};

    for my $index ( 0, -1 ) {
        next unless ref $elt->{_content}[$index];

        my $br = $elt->{_content}[$index];
        next unless $br->{_tag} eq 'br';
        $index == 0
          ? $br->detach()->preinsert($br)
          : $br->detach()->postinsert($br);
    }
}

sub removeElt {
    my $elt = shift;
    return unless ref $elt;

    $elt = cleanupElt($elt);
    return 1 unless $elt;
    return 0 unless $elt->{_tag} =~ $inline;
    return 1 if $elt->{_tag} eq 'font' and removeFont($elt);
    return !exists $elt->{_content};
}

sub removeFont {
    my $elt = shift;

    delete $elt->{face} if exists $elt->{face} and lc $elt->{face} eq 
+'verdana';
    delete $elt->{size} if exists $elt->{size} and $elt->{size}    eq 
+'1';
    return !grep { /^[^_]/ } keys %$elt;
}
[download]

Prints:

      <p><font color="#0000ff">paragraph</font></p>
      <p align="center">
         <br /><a href="#"><font color="#0000ff">&euro; 750aa</font> <
+b><i>foo</i></b> </a><br />
      </p>
[download]

For reference the same sample run through your code renders as:

<font color="#0000ff">
<p>paragraph</p>
</font>
<P align="center"><a href="#"><br/>
<font color="#0000ff">&euro; 750aa</font>
<B><i>foo</b></i>

</a><br />
</p>
[download]

Perl is environmentally friendly - it saves trees

[reply]
[d/l]
[select]

Re^2: Cleaning up HTML

by bart (Canon) on Dec 21, 2007 at 11:49 UTC

Thank you, Santa Claus!

[reply]

Re: Cleaning up HTML
by clinton (Priest) on Dec 23, 2007 at 12:24 UTC

HTML::StripScripts

HTML::StripScripts::Parser

This will churn through your HTML (either a complete HTML page or an HTML snippet), tidy up the HTML, fix tag nesting, remove scripts, remove unknown attributes etc.

Through the Rules => {} parameter, you can specify exactly what tags and attributes you want to allow through, adding regexes or callbacks to customise the results.

fixing tag soup, like "foo"
Yes
avoid inline elements wrapped around block elements, for example a "p" tag wrapped in "font" tags
Yes
stripping (some) empty or whitespace-only elements, such as ""
Yes with a callback such as:
 Rules => {
 * => sub {
 my ($filter,$element) = @_;
 return $element->{content} =~ m/\S/ ? 1 : 0
 }
 }
[download]
removing unnecessary tags, for example, if there's a "" tag, strip it and its corresponding "" tag, because that's my default font and size for the table - but leave in a font tag if there are any other attributes after dropping the ones with the default value: for example "" -> ""
with a callback like:
 Rules => {
 font => sub {
 my ($filter,$element) = @_;
 my $attr = $element->{attr};
 delete $attr->{size} if $attr->{size} && $attr->{
+size} eq 1;
 delete $attr->{face} if $attr->{face} && lc($attr
+->{face}) eq 'verdana';
 return keys %$attr ? 1 : 0;
 }
 }
[download]
moving " " tags out of links, when at the edge of the link text: <a href="linkto">link text </a> -> <a href="linkto">link text</a> 
This is the only one I don't have a callback for. It would need to be handled with a regex run at the end. That said, it'd be pretty easy to change sub output_stack_entry to allow you to pass back the literal HTML to use instead of reassembling the components, for instance:
 Rules => {
 a => sub {
 my ($filter,$element) = @_;
 my $content = $element->{content};
 my ($pre,$post) = ('','');
 if ($content=~s{^\s* }{}) {
 $pre = ' ';
 }
 if ($content=~s{ \s*$}{}) {
 $post = ' ';
 }
 if ($pre || $post) {
 $element->{literal} = 
 $pre.
 '<a ' . $self->_hss_join_attribs( $eleme
+nt{attr} ) . '>'
 . $content 
 . '</a>'
 . $post;
 }
 return 1;
 }
 }
[download]

Update: I'm the maintainer of HTML::StripScripts, and I added the Rules => {} parameter to HTML::StripScripts, which makes it easier to customise. But all credit for the underlying module must go the Nick Cleaton, the original author, who did a very very good job indeed

Update: Tidied up the HTML

[reply]
[d/l]
[select]

Re^2: Cleaning up HTML

by bart (Canon) on Apr 22, 2008 at 09:30 UTC

Now, how do you recommend combining two span tags? For example, I have this crufty html:

<span style="font-family:Arial"><span style="font-size:10pt; color:#00
+0080;">text</span></span>
[download]

<span style="font-family:Arial; font-size:10pt; color:#000080;">text</
+span></span>
[download]

HTML::StripScripts

Now, if I do a callback for the span like

my $p = HTML::StripScripts::Parser->new({
  Rules => {
    span => sub { my ($filter,$element) = @_;
                    print Dumper $element if $element->{content} =~ /^
+<span\W/;
                    1;
                },
    }
  }
);
[download]

$VAR1 = {
          'content' => '<span style="font-size:10pt; color:#000080;">t
+ext</span>',
          'tag' => 'span',
          'attr' => {
                      'style' => 'font-family:Arial'
                    }
        };
[download]

How do you recommend to proceed from here? Should I parse the "content" again, and how?

Also... how do you remove tags but not its content? If I return '0' from this callback sub, then both the tags and the inner HTML are gone.

[reply]
[d/l]
[select]

Re^3: Cleaning up HTML

by clinton (Priest) on Apr 22, 2008 at 09:51 UTC

I've been trying some things with this module, and it's a bit hard going.

Agreed - it makes certain things very easy, but other bits (like adding allowed attributes) are nasty and crust. As we've discussed privately, this module could do with a major API overhaul. And a new name! Stripping scripts is just a part of what this does. It's actually a very powerful HTML tidier

I'd like to combine the two span tags, merging their style attributes:

Merging adjacent tags wasn't a use that I envisaged, but that was my own lack of imagination - the next version will have some nice way of querying the parent and sibling elements.

I've been trying to think about how you could do this with the current module (using private methods and properties), but it's nasty, and would probably result in having to reparse content, because to merge two spans, you need to know:

that the child span HAS content
the the parent span has NO content
but you don't know that until you're finished with the parent span because it's a stream parser
by which time, the child span has been converted to text

As per the private message you've just sent me, maintaining the nodes as a DOM-tree seems like the only reasonable way to handle this.

UPDATE So if we're planning on building DOM-trees, this starts sounding like a job for XML::LibXML, or HTML::TreeBuilder. But HTML::StripScripts does three things that neither of these modules does (AFAIK):

it understands the meaning and context of tags, so that it can figure out that lone <li> elements cannot exist outside <ul> or <ol>
it provides a mechanism for parsing / allowing / disallowing attribute and CSS style declarations
it cleans up XSS attacks

Ideas for the new glamorous DOM-tree based version gladly accepted

[reply]
[d/l]
[select]

Re: Cleaning up HTML
by wfsp (Abbot) on Dec 22, 2007 at 07:17 UTC

Warning: extremely beta and needs more testing. :-)

#!/bin/perl5

use strict;
use warnings;
use Data::Dumper;
use HTML::TokeParser::Simple;
$Data::Dumper::Indent = 1;
$Data::Dumper::Sortkeys = 1;

my ($p, $t, $html, $clean);
$html = do{local $/;<DATA>};

open my $out, q{>}, q{clean.html} or die qq{cant open to read: $!\n};

print $out qq{dirty:\n $html\n};

# fix broken nesting

$p = HTML::TokeParser::Simple->new(\$html);
my (@stack);
my %stand_alone = (br => undef, hr =>undef);
while ($t = $p->get_token){
    if ($t->is_end_tag){
        my $tag = pop @stack;
        $clean .= qq{</$tag>};
        next;
    }
    if ($t->is_start_tag and not exists $stand_alone{$t->get_tag}){
        push @stack, $t->get_tag;
    }
    $clean .= $t->as_is;
}

print $out qq{clean 1:\n*$clean*\n};

# fix font tags

$html = $clean;
$p = HTML::TokeParser::Simple->new(\$html)
    or die qq{parse failed\n};
$clean = q{};
while ($t = $p->get_token){
  if ($t->is_start_tag(q{font})){
    $clean .= fix_font();
  }
  else{
    $clean .= $t->as_is;
  }
}

print $out qq{clean 2:\n *$clean*\n};
# fix br

$html = $clean;
$p = HTML::TokeParser::Simple->new(\$html);
$clean = q{};
while ($t = $p->get_token){
  if ($t->is_start_tag(q{a})){
    $clean .= fix_br();
  }
  else{
    $clean .= $t->as_is;
  }
}

print $out qq{clean 3:\n $clean\n};

# fix empty tags

$html = $clean;
$clean = q{};
my @strip_empty = qw{b i};
$clean = q{};
my ($in_tag, $start, $end);
my $fragment = q{};

for my $tag (@strip_empty){
    my $p = HTML::TokeParser::Simple->new(\$html);
    while (my $t = $p->get_token){
      if ($t->is_start_tag($tag)){
        $in_tag++;
        $start = $t->as_is;
        next;
      }
      if ($in_tag){
        if ($t->is_end_tag($tag)){
          $in_tag--;
          if ($fragment =~ /\S/){
                $clean .= join q{}, $start, $fragment, $t->as_is;
                $fragment = q{};
              }
        }
        else{
            $fragment .= $t->as_is;
            }
            next;
      }
      $clean .= $t->as_is;
    }
    $html = $clean;
}

print $out qq{clean 4:\n $clean};

sub fix_br {
  my (@fragment, @tags);
  my $start = $t->as_is;
  while ($t = $p->get_token){
    last if $t->is_end_tag(q{a});
    my $as_is = $t->as_is;
    my $tag = $t->get_tag;
    $tag ||= q{no tag};
    push @fragment, {
      as_is => $as_is,
      tag   => $tag,
    };
    push @tags, $tag if $t->get_tag;
  }
  my $end = $t->as_is;
  my $found_br;
  if ($tags[-1] eq q{br}){
    $found_br++;
    @fragment = grep{not $_->{tag} eq q{br}} @fragment;
  }
  my $fixed = join q{}, map{$_->{as_is}} @fragment;
  return join q{}, $start, $fixed, $end, $found_br?q{<br />}:q{};
}

sub fix_font {
  if ($t->get_attr(q{face}) eq q{Verdana} and $t->get_attr(q{size}) eq
+ q{1}){
    $t->delete_attr(q{face});
    $t->delete_attr(q{size});
  }
  my $start = $t->as_is;
  my $attr = $t->get_attr;
  my $fragment;
  while ($t = $p->get_token){
    last if $t->is_end_tag(q{font});
    $fragment .= $t->as_is;
  }
  if (not $fragment =~ /\S/){ # empty font tags?
    return  q{};
  }
  if (keys %{$attr}){
    return join q{}, $start, $fragment, $t->as_is;
  }
  else{
    return $fragment;
  }
}

__DATA__
<font color="#0000ff" face="Verdana" size="1">
</font>
<p align="center">
<a href="#">
<font color="#0000ff" face="Verdana" size="1">&euro; 750aa</font>
<br />
</a>
<b><i>bad nesting</b></i>
<b></b>
<b>bold</b>
</p>
[download]

[reply]
[d/l]

Re: Cleaning up HTML
by Anonymous Monk on Dec 23, 2007 at 23:12 UTC

On a recent job I did for http://www.uksecurityservices.com I found that Dreamweaver actually did a supurb job of cleaning up the code.

[reply]


We don't bite newbies here... much
	PerlMonks