#!/usr/bin/perl -w
use strict;
use HTML::Parser ();
# See PYX format description
# http://www.xml.com/pub/a/2000/03/15/feature/index.html
# Escape table for PYX payloads. PYX is line-oriented (one record per line),
# so a raw newline inside a value would be read back as the start of a new
# record. Backslash is escaped as well so the encoding stays reversible.
my %pyx_escape_for = (
    "\\" => "\\\\",
    "\n" => "\\n",
    "\t" => "\\t",
);

# Return $value with backslash, newline and tab replaced by their PYX
# escapes. An undefined value is treated as the empty string.
my $pyx_escape = sub {
    my ($value) = @_;
    $value = '' unless defined $value;
    $value =~ s/([\\\n\t])/$pyx_escape_for{$1}/g;
    return $value;
};

# Event-driven parser that prints one PYX record per event to STDOUT:
#   (name         start tag
#   Aname value   attribute of the preceding start tag
#   -text         character data
#   )name         end tag
my $parser = HTML::Parser->new(
    xml_mode        => 1,
    unbroken_text   => 1,    # one text event per text run, not per chunk
    ignore_elements => ['style', 'script'],    # CDATA isn't supported
    start_h => [
        sub {
            my ($tag, $attr, $attr_names) = @_;
            print "($tag\n";

            # Walk the attributes in document order (attrseq) so the output
            # is reproducible; a plain keys() on the hash is not. A name
            # that is repeated in the tag is reported only once.
            my %seen;
            for my $name (grep { !$seen{$_}++ } @{$attr_names}) {
                print "A$name " . $pyx_escape->($attr->{$name}) . "\n";
            }
        }, "tagname, attr, attrseq"],
    end_h => [
        sub {
            my ($tag) = @_;
            print ")$tag\n";
        }, "tagname"],
    text_h => [
        sub {
            my ($text) = @_;

            # Trim surrounding whitespace; a run that was whitespace only
            # carries no data, so no empty "-" record is written for it.
            $text =~ s/\A\s+|\s+\z//g;
            return unless length $text;
            print "-" . $pyx_escape->($text) . "\n";
        }, "dtext"],
);
# Command-line driver: convert every file named on the command line, writing
# the PYX records for all of them to STDOUT in argument order.
die "usage: $0 file1.html > file1.pyx\n" unless @ARGV;
for my $file (@ARGV) {
    # parse_file() returns undef (with the reason in $!) when the file cannot
    # be opened. Report it and move on to the next file instead of silently
    # producing no output for this one.
    unless (defined $parser->parse_file($file)) {
        warn "$0: cannot open '$file': $!\n";
        next;
    }

    # Signal end of document so any buffered text is flushed before the
    # next file is parsed.
    $parser->eof();
}