{
    # MyParser: HTML::Parser subclass that records the inner markup of every
    # <div> in the parsed document.
    #
    # State kept on the parser object:
    #   divs      - array ref with one string per <div>, in document order;
    #               each string is everything between that div's start tag
    #               and its end tag (the div's own tags are not included)
    #   dc        - number of <div> elements currently open
    #   open_divs - stack of indexes into divs for the currently open <div>s
    package MyParser;
    use strict;
    use warnings;
    use base 'HTML::Parser';

    # Append $chunk to every <div> that is currently open, so an outer div
    # also receives the markup of any div nested inside it.
    sub _append_to_open_divs {
        my ( $self, $chunk ) = @_;
        $self->{divs}[$_] .= $chunk for @{ $self->{open_divs} || [] };
        return;
    }

    # Start-tag handler. The tag text is added to the divs already open
    # *before* a new <div> is registered, so a div never contains its own
    # start tag.
    sub start {
        my ( $self, $tagname, $attr, $attrseq, $origtext ) = @_;
        $self->_append_to_open_divs($origtext);
        if ( $tagname eq 'div' ) {
            push @{ $self->{divs} }, '';
            push @{ $self->{open_divs} }, $#{ $self->{divs} };
            $self->{dc}++;
        }
        return;
    }

    # End-tag handler. A </div> closes the innermost open div *before* the
    # tag text is appended, so a div never contains its own end tag. A stray
    # </div> with no open div is ignored instead of driving the depth
    # counter negative.
    sub end {
        my ( $self, $tagname, $origtext ) = @_;
        if ( $tagname eq 'div' && @{ $self->{open_divs} || [] } ) {
            pop @{ $self->{open_divs} };
            $self->{dc}--;
        }
        $self->_append_to_open_divs($origtext);
        return;
    }

    # Text handler: plain text is kept only while inside a <div>.
    sub text {
        my ( $self, $origtext, $is_cdata ) = @_;
        $self->_append_to_open_divs($origtext);
        return;
    }

    # Comment handler: receives the comment body without its delimiters, so
    # they are put back before the comment is stored.
    sub comment {
        my ( $self, $comment_text ) = @_;
        $self->_append_to_open_divs("<!--$comment_text-->");
        return;
    }
}
my $p = MyParser->new;
$p->parse($content);
# parse() may hold back trailing text until it knows the chunk is complete;
# eof() marks the end of the document and flushes whatever is still buffered.
$p->eof;
# {divs} is only created when at least one <div> was seen, so guard the
# dereference. delete() hands back the collected list and removes the key in
# one step, which stops the strings accumulating in the parser object.
if ( my $divs = delete $p->{divs} ) {
    print "($_)\n" for @{$divs};
}
Try your example on this HTML (assign $content before calling parse):
# Sample document: two sibling <div> elements, the first holding a comment
# and the second a link. The heredoc is non-interpolating and starts with a
# blank line, so the string begins with a newline.
$content = <<'END_HTML';

<html>
<div>foo
<!-- comment here -->
</div>
<div id="foo">bar
<a href="hello">somestuff</a>
</div>
</html>
END_HTML
|