#!/usr/bin/perl
use strict;
use warnings;

use HTML::Parser ();

# Convert HTML/XML files to PYX, a line-oriented notation with one parse
# event per output line.
#
# See PYX format description
# http://www.xml.com/pub/a/2000/03/15/feature/index.html
#
# Lines written to STDOUT:
#   (name         start tag
#   Aname value   attribute of the start tag just opened
#   -text         character data (leading/trailing whitespace trimmed)
#   )name         end tag
#
# Usage: html2pyx file1.html [file2.html ...] > out.pyx
# Exit status is non-zero if any input file could not be opened.

# Escape data for a PYX line. Every event must fit on one physical line, so
# backslash, newline and tab are written as \\, \n and \t. Backslash goes
# first so the escapes added afterwards are not escaped a second time.
sub pyx_escape {
    my ($text) = @_;
    $text =~ s/\\/\\\\/g;
    $text =~ s/\n/\\n/g;
    $text =~ s/\t/\\t/g;
    return $text;
}

my $parser = HTML::Parser->new(
    xml_mode        => 1,
    unbroken_text   => 1,
    ignore_elements => ['style', 'script'],    # CDATA isn't supported

    # Start tag: "(name" followed by one "Aname value" line per attribute.
    start_h => [
        sub {
            my ($tag, $attr, $attrseq) = @_;
            print "($tag\n";

            # attrseq lists attribute names in source order, which keeps
            # the output stable between runs (hash key order is not).
            # A repeated attribute name is reported only once.
            my %seen;
            for my $name (@{$attrseq}) {
                next if $seen{$name}++;
                print "A$name ", pyx_escape($attr->{$name}), "\n";
            }
        },
        "tagname, attr, attrseq",
    ],

    # End tag: ")name".
    end_h => [
        sub {
            my ($tag) = @_;
            print ")$tag\n";
        },
        "tagname",
    ],

    # Character data (entities already decoded by "dtext"): "-text".
    text_h => [
        sub {
            my ($text) = @_;
            $text =~ s/^\s+//;
            $text =~ s/\s+$//;

            # Whitespace-only runs between tags carry no data; skip them
            # instead of emitting an empty "-" line.
            return unless length $text;
            print "-", pyx_escape($text), "\n";
        },
        "dtext",
    ],
);

die "usage: $0 file1.html > file1.pyx\n" unless @ARGV;

my $failures = 0;
for my $file (@ARGV) {
    # parse_file returns undef (with $! set) when the file cannot be opened.
    unless (defined $parser->parse_file($file)) {
        warn "$0: cannot open '$file': $!\n";
        $failures++;
        next;
    }
    $parser->eof();
}

exit($failures ? 1 : 0);