use warnings;
use strict;
use XML::SAX::Machines qw(Pipeline);
use XML::Driver::HTML;
use XML::Filter::SAX1toSAX2;
use XML::Filter::BufferText;
use XML::SAX::Writer;
# Usage: perl SCRIPT FILE.html
#
# Parses the HTML file named by the first command-line argument, pushes the
# resulting SAX events through a filter pipeline and prints the serialised
# result on STDOUT.
@ARGV or die "Usage: $0 FILE.html\n";

# Transformation target: the writer is handed a reference to this scalar
# as its Output, so the serialised document ends up here.
my $output;
my $writer = XML::SAX::Writer->new( Output => \$output );

# Pipeline stages, in the order events flow through them:
#   XML::Filter::SAX1toSAX2      - per its docs, upgrades SAX1 events to SAX2
#   XML::Filter::BufferText      - per its docs, merges adjacent text events
#   XML::Filter::HtmlTagStripper - declared further down in this file; only
#                                  lets <p>, <div> and <a> tags through
#   $writer                      - serialises whatever is left into $output
my $machine = Pipeline(
    'XML::Filter::SAX1toSAX2'      =>
    'XML::Filter::BufferText'      =>
    'XML::Filter::HtmlTagStripper' =>
    $writer
);

# Direct method call instead of the indirect-object form "new Class(...)",
# which perl can mis-parse and which perlobj advises against.
my $html = XML::Driver::HTML->new(
    Handler => $machine,
    Source  => { SystemId => $ARGV[0] },
);
$html->parse();

# $output stays undefined if nothing was written; skip the print in that
# case rather than emit an "uninitialized value" warning.
print $output if defined $output;
package XML::Filter::HtmlTagStripper;
use base qw|XML::SAX::Base|;
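# Shape of the element hash ($el) received by the handlers below,
# e.g. for an element named "marker" with a language="foo" attribute:
#   $el->{Name}                             is 'marker'
#   $el->{Attributes}{'{}language'}         is the language attribute (a hash)
#   $el->{Attributes}{'{}language'}{Value}  is 'foo'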
# Start-tag handler.
#
# Passes the event on to the parent class only when the element name is
# p, div or a (compared case-insensitively); the parent's return value is
# handed back in that case.  For every other element nothing is forwarded
# and the false result of the name match is returned.
#
#   $self    - the filter object
#   $element - element hash; only its {Name} entry is inspected here
sub start_element {
    my ( $self, $element ) = @_;

    my $is_kept = $element->{Name} =~ m/^(?:p|div|a)$/i;
    return $is_kept unless $is_kept;

    return $self->SUPER::start_element($element);
}
# End-tag handler.
#
# Mirror image of the start-tag handler: the event is passed on to the
# parent class only when the element name is p, div or a (compared
# case-insensitively), and the parent's return value is handed back.  For
# every other element nothing is forwarded and the false result of the
# name match is returned.
#
#   $self    - the filter object
#   $element - element hash; only its {Name} entry is inspected here
sub end_element {
    my ( $self, $element ) = @_;

    my $is_kept = $element->{Name} =~ m/^(?:p|div|a)$/i;
    return $is_kept unless $is_kept;

    return $self->SUPER::end_element($element);
}
1;
####
$ cat striptags.html
Test Document
The first paragraph
the second paragraph
last modified: WHENEVER
$ perl striptags.pl striptags.html
Test DocumentThe first paragraph
the second paragraph
last modified: WHENEVER