#!/usr/bin/perl -w
use strict;
use HTML::Parser ();

# See PYX format description
# http://www.xml.com/pub/a/2000/03/15/feature/index.html

# Escape a string for use as PYX data.  PYX is strictly line-oriented, so a
# raw newline inside a value would split one event across several lines;
# backslash, newline and tab are therefore written as \\, \n and \t.
sub pyx_escape {
        my ($text) = @_;
        $text =~ s/\\/\\\\/g;   # first, so the escapes added below aren't doubled
        $text =~ s/\n/\\n/g;
        $text =~ s/\t/\\t/g;
        return $text;
}

# Event-driven parser that prints one PYX line per event to STDOUT:
#   (name        start tag
#   Aname value  attribute of the preceding start tag
#   )name        end tag
#   -text        character data
my $parser = HTML::Parser->new(
        xml_mode        => 1,
        unbroken_text   => 1,
        ignore_elements => ['style', 'script'], # CDATA isn't supported
        start_h => [
                sub {
                        my ($tag, $attr, $attrseq) = @_;
                        print "($tag\n";
                        # Walk attrseq rather than keys %$attr so attributes come
                        # out in document order instead of random hash order;
                        # %seen guards against a name being listed twice.
                        my %seen;
                        foreach my $name (grep { !$seen{$_}++ } @{$attrseq}) {
                                print "A$name ", pyx_escape($attr->{$name}), "\n";
                        }
                }, "tagname, attr, attrseq"],
        end_h   => [
                sub {
                        my ($tag) = @_;
                        print ")$tag\n";
                }, "tagname"],
        text_h  => [
                sub {
                        my ($text) = @_;
                        # Strip surrounding whitespace (indentation between tags).
                        $text =~ s/\A\s+//;
                        $text =~ s/\s+\z//;
                        # Whitespace-only runs carry no data; don't emit bare "-" lines.
                        return if $text eq '';
                        print "-", pyx_escape($text), "\n";
                }, "dtext"],
);

die "usage: $0 file1.html > file1.pyx\n" unless @ARGV;

# Convert every file named on the command line, writing PYX to STDOUT.
# A file that cannot be opened is reported on STDERR and skipped; the
# remaining files are still processed and the exit status becomes non-zero.
my $failed = 0;
foreach my $file (@ARGV) {
        # parse_file() returns undef, with $! set, when it cannot open the file.
        unless ($parser->parse_file($file)) {
                warn "$0: can't open $file: $!\n";
                $failed = 1;
                next;
        }
        # Signal end of document so no buffered text leaks into the next file.
        $parser->eof();
}
exit 1 if $failed;