#!/usr/bin/perl --
use strict;
use warnings;
our $VERSION = 20120112; # 2012-01-12
use HTML::TreeBuilder;
Main(@ARGV);
exit(0);
sub Main {
if (@_) {
PumpDump(@_);
#~ PumpDump('', qw/look_down criteria/ );
#~ PumpDump('file', qw/look_down criteria/ );
} else {
print "Usage: $0 file _tag div\n\n";
for my $ix ( 1 .. 3 ){
my $demo = "Demo$ix";
print "$demo\n";
__PACKAGE__->can($demo)->();
}
print "Usage: $0 file _tag div\n\n";
} ## end else [ if (@_) ]
} ## end sub Main
sub Demo1 {
my $html = <<'__HTML__';
key1 | val1 |
key2 | val2 |
key3 | val3 |
key4 | val4 |
key5 | val5 |
key6 | val6 |
key7 | val7 |
key8 | val8 |
key9 | val9 |
key10 | val10 |
key11 | val11 |
|
__HTML__
PumpDump( $html, _tag => qr/table|strong/i );
} ## end sub Demo1
sub Demo2 {
my $html = <<'__HTML__';
__HTML__
PumpDump( $html, _tag => qr/div/i );
} ## end sub Demo2
sub Demo3 {
my $html = <<'__HTML__';
educa.ch
Altes Schulhaus Ossingen
Guntibachstrasse 10
8475 Ossingen
Tel:
052 317 15 45
Fax:
052 317 04 42
__HTML__
PumpDump( $html, _tag => qr/div/i );
} ## end sub Demo3
sub HTML::Element::addressx {
return join(
'/',
'', # // ROOT
reverse( # so it starts at the top
map {
my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, in xpath, start counting at 1? xpather, xpath checker agree
my $t = $_->tag;
## LEFT CAN BE A STRING
my @left = $_->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
)
);
} ## end sub HTML::Element::addressx
sub HTML::Element::addressxx {
my (@stuff) = (
map {
my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, in xpath, start counting at 1? xpather, xpath checker agree
my $t = $_->tag;
## LEFT CAN BE A STRING
my @left = $_->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( my $attid = $_->attr('id') ) {
$attid = xpath_attr_escape( $attid );
$count = "[\@id='$attid']";
} elsif ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
);
#~ use DDS; print Dump(\@stuff),"\n";
use List::MoreUtils qw[ before_incl ];
my $stuff = @stuff;
@stuff = before_incl { /\[\@id/i } @stuff;
return join(
'/',
( $stuff > @stuff ? '/' : '' ),
reverse( # so it starts at the top
@stuff
)
);
} ## end sub HTML::Element::addressxx
sub HTML::Element::addressxX {
my (@stuff) = (
map {
my $e = $_;
my $count = 1; # 2011-03-02-01:26:06 duh, off byone error, in xpath, start counting at 1? xpather, xpath checker agree
my $t = $e->tag;
my @left = $e->left;
for my $left (@left) {
eval { $count++ if $left->tag eq $t };
}
if ( my $attid = $e->id ) {
$attid = xpath_attr_escape( $attid );
$count = "[\@id='$attid']";
} elsif ( my @att = grep !/^id$/, $e->all_external_attr_names ) {
$count = '['
. join( ' and ',
map { sprintf q!@%s='%s'!, $_, xpath_attr_escape($e->attr($_)) } @att )
. ']';
} elsif ( $count > 1 ) {
$count = "[$count]";
} else {
$count = '';
}
$t . $count
} $_[0], # self and...
$_[0]->lineage
);
#~ use DDS; print Dump(\@stuff),"\n";
my $stuff = @stuff;
use List::MoreUtils qw[ before_incl ];
@stuff = before_incl { /\[\@id/i } @stuff;
return join(
'/',
( $stuff > @stuff ? '/' : '' ),
reverse( # so it starts at the top
@stuff
)
);
} ## end sub HTML::Element::addressxX
sub PumpDump {
my ( $html, @lookdown ) = @_;
my $tree = HTML::TreeBuilder->new();
if ( $html =~ / ) {
$tree->parse($html);
} else {
if( $html =~ /\.xml$/ ){
$tree->implicit_tags(0);
$tree->no_expand_entities(1);
$tree->ignore_unknown(0);
$tree->ignore_ignorable_whitespace(0);
$tree->no_space_compacting(1);
$tree->store_comments(1);
$tree->store_pis(1);
}
$tree->parse_file($html);
}
$tree->eof;
warn $tree->as_HTML, " " if $html =~ /\.xml$/; # because it just doesn't work for xml
@lookdown = sub{1} unless @lookdown; # every tag
for my $td ( $tree->look_down(@lookdown) ) {
my $text = $td->as_trimmed_text;
next if $text =~ /^\p{Zs}*$/; ## ysth, nbsp isn't \s
print $td, "\t", $td->address, "\n";
print $text, "\n";
print $td->addressx, "\n";
print $td->addressxx, "\n";
print $td->addressxX, "\n";
print '-' x 66, "\n";
} ## end for my $td ( $tree->look_down...)
$tree->delete;
undef $tree;
print '#' x 66, "\n\n";
} ## end sub PumpDump
BEGIN {
my %rep = qw{ " " ' ' } ;
sub xpath_attr_escape {
my( $t ) = @_;
$t =~ s/(['"])/ $rep{$1} /ge;
$t;
}
}
__END__