note
Anonymous Monk
<p> And here is the libxml variant
<readmore><c>
#!/usr/bin/perl --
use strict;
use warnings;
use XML::LibXML 1.70; ## for load_html/load_xml/location
use Getopt::Long();
## Script entry point: hand the command-line arguments to Main (a named sub
## defined further down in this file), then exit with status 0.
Main( @ARGV );
exit( 0 );
## Usage()
## Returns the help text as a single string. Every command example is
## prefixed with the running program name ($0). The text starts with an
## empty line and ends with an empty line.
sub Usage {
    my $prog = $0;
    my @lines = (
        '',
        'Usage:',
        '## xpatherize only terminal nodes (no descendents)',
        '## xpatherize only terminal tags (no subtags)',
        "$prog xml_or_html_file_or_URL",
        "$prog xml_or_html_file_or_URL //tagname",
        "$prog http://example.com/?htm_signals_to_treat_it_as_html //a",
        '## force load_xml or load_html',
        "$prog --xml xml_or_html_file_or_URL //tagname",
        "$prog --html xml_or_html_file_or_URL //tagname",
        '## force xpatherize only terminal nodes (no descendents)',
        '## force xpatherize only terminal tags (no subtags)',
        "$prog --terminal xml_or_html_file_or_URL //tagname",
        '## force xpatherize all matching nodes (disable terminal)',
        "$prog --all xml_or_html_file_or_URL //tagname",
        "$prog --help",
        '',
    );
    return join( "\n", @lines ) . "\n";
}
## Main( @argv )
## Parses the command line, loads the document with XML::LibXML and prints,
## for every node selected by the XPath argument, libxml's own nodePath, the
## generated fullxpath variants and a shortened copy of the text content.
##
##   @argv : options followed by the file/URL and an optional XPath
##           expression (the XPath defaults to '//*').
##
## Dies with the usage text when no arguments are given, when option parsing
## fails, or when no file/URL is left after the options were consumed.
## With --help the usage text is printed and the sub returns.
sub Main {
    @_ or die Usage();
    my %opt;
    ## GetOptionsFromArray returns false on unknown/malformed options;
    ## treat that as a usage error instead of carrying on silently.
    Getopt::Long::GetOptionsFromArray(
        \@_,
        \%opt,
        q{html|htm|ht!},
        q{all|a!},
        q{terminal|term|t!},
        q{xml|xm|x!},
        q{help|h!},
        q{posy!}, # todo??? nah
        q{star!},
        q{rats!},
        q{raid!},
    ) or die Usage();
    $opt{help} and return print Usage();

    my( $url , $path ) = @_;
    ## An option-only call (e.g. just "--xml") leaves nothing to load.
    defined $url and length $url or die Usage();

    ## Anything with "htm" in its location is parsed as HTML unless the
    ## parser is forced; --xml wins over --html when both are given.
    my $load = $url =~ /htm/i ? 'load_html' : 'load_xml';
    $opt{html} and $load = 'load_html';
    $opt{xml} and $load = 'load_xml';

    ## Terminal-only (skip nodes that have element descendants) is the
    ## default without an explicit XPath; --all wins over --terminal.
    my $terminal = 1;
    $path and $terminal = 0;
    $opt{terminal} and $terminal = 1;
    $opt{all} and $terminal = 0;
    $path or $path = '//*';

    my $dom = XML::LibXML->new(
        qw/ recover 2 /
    )->$load(
        location => $url,
    );

    for my $node( $dom->F( $path ) ){
        next if $terminal and $node->F('.//*')->size;
        print
            #~ '# ', overload::StrVal($node), "\n",
            $node->nodePath,"\n",
            $node->fullxpath,"\n",
            "# \x22content\x22\n ",shorten( $node->textContent ),
            "\n\n------\n",
        ;
    }
}
## Compile-time setup: the quote replacement table with the escaping helper
## that closes over it, the shared XPath context, and the F() shortcut that
## is installed on every XML::LibXML::Node.
BEGIN {
    ## NOTE(review): as written, each quote character maps to itself, so
    ## xpath_attr_escape returns its input unchanged. Possibly the
    ## replacements were entity escapes that got decoded when the code was
    ## posted -- confirm before relying on any escaping here.
    my %replacement_for = ( q{"} => q{"}, q{'} => q{'} );

    ## xpath_attr_escape( $text )
    ## Returns a copy of $text in which every single or double quote is
    ## replaced by its entry in the replacement table.
    sub xpath_attr_escape {
        my ($text) = @_;
        $text =~ s/(['"])/$replacement_for{$1}/g;
        return $text;
    }

    ## One XPath context shared by every F() call.
    $::xpc = XML::LibXML::XPathContext->new();

    ## $node->F( $xpath [, $context ] )
    ## Runs findnodes for $xpath on the shared context, relative to
    ## $context when given (and true), otherwise relative to the invocant.
    ## The result follows the caller's context (findnodes decides).
    sub XML::LibXML::Node::F {
        my ( $self, $xpath, $context ) = @_;
        return $::xpc->findnodes( $xpath, $context || $self );
    }
}
## $node->POS
## Returns the 1-based index of the node among its siblings that carry the
## same name: the number of preceding same-named element siblings plus one.
sub XML::LibXML::Node::POS {
    my ($self) = @_;
    my $same_named_before =
        $self->F( 'preceding-sibling::*[name()="' . $self->getName() . '"]' );
    return $same_named_before->size + 1;
}
## shorten( @pieces )
## Concatenates the arguments, collapses every run of CR/LF/TAB characters
## into a single space and, when the result is longer than 71 characters,
## keeps the first 69 characters and appends '...'.
sub shorten {
    my $text = join '', @_;
    $text =~ s/[\r\n\t]+/ /gs;
    if ( length($text) > 71 ) {
        $text = substr( $text, 0, 69 ) . '...';
    }
    return $text;
}
## $node->fullxpath
## Returns one report string holding several alternative XPath expressions
## for the node, each under its own "# label" heading: posy, star and rats
## always; raid and chop only when the text built so far contains an
## @id = "..." predicate.
sub XML::LibXML::Node::fullxpath {
    my ($node) = @_;

    my $report = "\n# posy\n" . fullxpath_posy($node) . "\n";
    $report = $report . "\n# star" . fullxpath_star($node) . "\n\n";
    $report = $report . "# rats\n" . fullxpath_rats($node) . "\n\n";

    ## Without an @id predicate the id-based variants have nothing to trim.
    return $report unless $report =~ /\s\@id\s=\s"/;

    $report = $report . "# raid\n" . fullxpath_rats_raid($node) . "\n";
    $report = $report . "# chop\n" . fullxpath_rats_cutoff($node) . "\n";
    return $report;
}
## *[...]/*[...] always
## fullxpath_star( $node )
## Builds an absolute path in which every step is the yatts() predicate form
## (with position) for the node and each of its ancestors. Every step is
## preceded by a newline and a slash. The topmost node (the one without a
## parent) contributes no step.
sub fullxpath_star {
    my $node = shift;

    ## Collect the steps root-first: the node itself, then each ancestor
    ## pushed in front of it.
    my @steps    = ( scalar yatts( $node, !!1 ) );
    my $ancestor = $node->getParentNode;
    while ( $ancestor and $ancestor->getParentNode() ) {
        unshift @steps, scalar yatts( $ancestor, !!1 );
        $ancestor = $ancestor->getParentNode();
    }

    my $xpath = '';
    for my $step (@steps) {
        $xpath .= "\n/" . $step;
    }
    return $xpath;
}
## /every[1]/node[1]/position[1]/always[1]
## fullxpath_posy( $node )
## Builds an absolute path of the form /name[pos]/name[pos]/... for the node
## and its ancestors, where pos comes from the node's POS method. The topmost
## node (the one without a parent) contributes no step.
sub fullxpath_posy {
    my $node = shift;

    my @steps;
    my $current = $node;
    while ( $current and $current->getParentNode() ) {
        my $index = $current->POS();
        unshift @steps, '/' . $current->getName() . '[' . $index . ']';
        $current = $current->getParentNode();
    }
    return join '', @steps;
}
## yatts( $node, $dopos )
## Returns one XPath step of the form
##   *[ name() = "..." and position() = N and @attr = "value" ... ]
## describing $node.
##
##   $node  : the element to describe.
##   $dopos : when true, a position() test is included.
##
## Attributes whose name contains "content" are left out.
sub yatts {
    my( $node, $dopos ) = @_;
    my $name = xpath_attr_escape( $node->getName() );
    my @ret = qq{name() = "$name"};
    if( $dopos ){
        ## Inside "*[ ... ]" position() counts ALL element children of the
        ## parent, not just the same-named ones, so the index must be taken
        ## among all preceding element siblings. Using the same-name index
        ## (POS) would select the wrong node, or none, as soon as siblings
        ## with different names are mixed.
        my $index = $node->F('preceding-sibling::*')->size + 1;
        push @ret, 'position() = ' . $index;
    }
    for my $att ( $node->attributes() ){
        my $att_name = $att->getName;
        next if $att_name =~ "content";
        my $value = xpath_attr_escape( $att->getValue );
        push @ret, qq{\@$att_name = "$value"};
    }
    return join '', '*[ ', join( ' and ', @ret ) , ' ]';
}
## datts( $node )
## Returns an array reference of XPath predicate fragments for $node: first
## a position() test, then one @attr = "value" test per attribute.
## Attributes whose name contains "content" are left out.
##
## The fragments are meant to be combined inside a "*[ ... ]" step, where
## position() counts ALL element children of the parent. The index is
## therefore taken among all preceding element siblings; the same-name index
## (POS) would point at the wrong node when sibling names are mixed.
sub datts {
    my( $node ) = @_;
    my $index = $node->F('preceding-sibling::*')->size + 1;
    my @ret = 'position() = ' . $index;
    for my $att ( $node->attributes() ){
        my $att_name = $att->getName;
        next if $att_name =~ "content";
        my $value = xpath_attr_escape( $att->getValue );
        push @ret, qq{\@$att_name = "$value"};
    }
    return \@ret;
}
## fullxpath_ratsy( $node )
## Returns the list of path steps for $node and its ancestors, starting with
## the node itself and walking upwards (callers reverse it). A step is
##   *[ name() = "..." and <datts fragments> ]   when datts() yields more
##                                               than the position test,
##   name[pos]                                   otherwise.
## The topmost node (the one without a parent) contributes no step.
sub fullxpath_ratsy {
    my $node = shift;

    my @steps;
    for (
        my $current = $node ;
        $current and $current->getParentNode() ;
        $current = $current->getParentNode()
      )
    {
        my $fragments = datts($current);

        ## Only the position test: the short name[pos] form is enough.
        if ( @$fragments <= 1 ) {
            push @steps, $current->getName() . '[' . $current->POS . ']';
            next;
        }

        my $name = xpath_attr_escape( $current->getName() );
        my $predicate = join ' and ', qq{name() = "$name"}, @$fragments;
        push @steps, '*[ ' . $predicate . ' ]';
    }
    return @steps;
}
## /position[1]/whennootheratts[3]/*[ position() = 1 and @other="atts" ]
## fullxpath_rats( $node )
## Absolute path built from the fullxpath_ratsy() steps in root-first order.
## Every step is followed by a newline and a space, and the steps are joined
## with '/' behind a leading '/'.
sub fullxpath_rats {
    my @root_first = reverse fullxpath_ratsy(@_);
    my @pieces;
    for my $step (@root_first) {
        push @pieces, $step . "\n ";
    }
    return join '/', '', @pieces;
}
## absolute with @id trumping other attrs
## fullxpath_rats_raid( $node )
## Same layout as fullxpath_rats, but built from the fullxpath_rats_theid()
## steps, i.e. steps that carry an @id are reduced to name[@id = "..."].
sub fullxpath_rats_raid {
    my @root_first = reverse fullxpath_rats_theid(@_);
    my @pieces;
    for my $step (@root_first) {
        push @pieces, $step . "\n ";
    }
    return join '/', '', @pieces;
}
## if @id remove all other attributes / id's are unique right?
## fullxpath_rats_theid( $node )
## Returns the fullxpath_ratsy() steps (same order). Every step that has
## both a name() test and a later @id test is rewritten to the short form
## name[@id = "..."]; all other steps are passed through unchanged.
sub fullxpath_rats_theid {
    my @steps;
    for my $step ( fullxpath_ratsy(@_) ) {
        if (
            $step =~ m{
                \sname\(\)\s=\s"([^"]+)"
                .+?
                \s(\@id\s=\s"[^"]+")
            }xi
          )
        {
            ## $1 is the element name, $2 the complete @id test.
            push @steps, "$1\[$2]";
        }
        else {
            push @steps, $step;
        }
    }
    return @steps;
}
## relative from first @id , with @id trumping other attrs
## fullxpath_rats_cutoff( $node )
## Path that starts at the nearest step (counting from the node upwards)
## that carries an @id test, built from the fullxpath_rats_theid() steps.
## When steps above that one were dropped the result starts with '//',
## otherwise with a single '/'. Every step is followed by a newline and a
## space.
sub fullxpath_rats_cutoff {
    ## Imported at compile time, like any "use", despite sitting in the sub.
    use List::MoreUtils qw[ before_incl ];

    my @all_steps = fullxpath_rats_theid(@_);

    ## Steps are ordered node-first, so this keeps everything up to and
    ## including the first step with an @id test.
    my @kept = before_incl { /\@id\s=\s"/i } @all_steps;

    my $prefix = @all_steps > @kept ? '/' : '';
    my @pieces;
    for my $step ( reverse @kept ) {
        push @pieces, $step . "\n ";
    }
    return join '/', $prefix, @pieces;
}
__END__
</c>
</readmore>
<p>This XML <readmore><c><?xml version="1.0" encoding="UTF-8"?>
<sub-group-tree>
<fake rocks="diamons">
<fake watch="ebolex" id="delicious">
<fake teeth="wood"> <niagra> peels </niagra> </fake>
<fake ailment="vasomunchgestion">
<fake condition="Hungary" id="staxicemnt">
<fake disease="chroniclion">
<ip-address-ranges>192.168.0.1/24</ip-address-ranges>
</fake>
</fake>
</fake>
</fake>
</fake>
</sub-group-tree>
</c></readmore>
<p> By default, the script produces these alternative paths to each matching node <readmore><c>/sub-group-tree/fake/fake/fake[1]/niagra
# posy
/sub-group-tree[1]/fake[1]/fake[1]/fake[1]/niagra[1]
# star
/*[ name() = "sub-group-tree" and position() = 1 ]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
/*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
/*[ name() = "niagra" and position() = 1 ]
# rats
/sub-group-tree[1]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
/*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
/niagra[1]
# raid
/sub-group-tree[1]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/fake[@id = "delicious"]
/*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
/niagra[1]
# chop
//fake[@id = "delicious"]
/*[ name() = "fake" and position() = 1 and @teeth = "wood" ]
/niagra[1]
# "content"
peels
------
/sub-group-tree/fake/fake/fake[2]/fake/fake/ip-address-ranges
# posy
/sub-group-tree[1]/fake[1]/fake[1]/fake[2]/fake[1]/fake[1]/ip-address-ranges[1]
# star
/*[ name() = "sub-group-tree" and position() = 1 ]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
/*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestion" ]
/*[ name() = "fake" and position() = 1 and @condition = "Hungary" and @id = "staxicemnt" ]
/*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
/*[ name() = "ip-address-ranges" and position() = 1 ]
# rats
/sub-group-tree[1]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ]
/*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestion" ]
/*[ name() = "fake" and position() = 1 and @condition = "Hungary" and @id = "staxicemnt" ]
/*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
/ip-address-ranges[1]
# raid
/sub-group-tree[1]
/*[ name() = "fake" and position() = 1 and @rocks = "diamons" ]
/fake[@id = "delicious"]
/*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestion" ]
/fake[@id = "staxicemnt"]
/*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
/ip-address-ranges[1]
# chop
//fake[@id = "staxicemnt"]
/*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ]
/ip-address-ranges[1]
# "content"
192.168.0.1/24
------
</c></readmore>
865792
865792
1