http://qs321.pair.com?node_id=1040902


in reply to htmltreexpather.pl - xpath helper, creates xpath search strings from html

And here is the libxml variant

#!/usr/bin/perl -- use strict; use warnings; use XML::LibXML 1.70; ## for load_html/load_xml/location use Getopt::Long(); Main( @ARGV ); exit( 0 ); sub Usage {" Usage: ## xpatherize only terminal nodes (no descendents) ## xpatherize only terminal tags (no subtags) $0 xml_or_html_file_or_URL $0 xml_or_html_file_or_URL //tagname $0 http://example.com/?htm_signals_to_treat_it_as_html //a ## force load_xml or load_html $0 --xml xml_or_html_file_or_URL //tagname $0 --html xml_or_html_file_or_URL //tagname ## force xpatherize only terminal nodes (no descendents) ## force xpatherize only terminal tags (no subtags) $0 --terminal xml_or_html_file_or_URL //tagname ## force xpatherize all matching nodes (disable terminal) $0 --all xml_or_html_file_or_URL //tagname $0 --help \n"; } sub Main { @_ or die Usage(); my %opt; Getopt::Long::GetOptionsFromArray( \@_, \%opt, q{html|htm|ht!}, q{all|a!}, q{terminal|term|t!}, q{xml|xm|x!}, q{help|h!}, q{posy!}, # todo??? nah q{star!}, q{rats!}, q{raid!}, ); $opt{help} and return print Usage(); my( $url , $path ) = @_; my $load = $url=~/htm/i ? 'load_html' : 'load_xml'; $opt{html} and $load = 'load_html'; $opt{xml} and $load = 'load_xml'; my $terminal = 1; $path and $terminal = 0; $opt{terminal} and $terminal = 1; $opt{all} and $terminal = 0; $path or $path = '//*'; my $dom = XML::LibXML->new( qw/ recover 2 / )->$load( location => $url, ); for my $node( $dom->F( $path ) ){ next if $terminal and $node->F('.//*')->size; print #~ '# ', overload::StrVal($node), "\n", $node->nodePath,"\n", $node->fullxpath,"\n", "# \x22content\x22\n ",shorten( $node->textContent ), "\n\n------\n", ;;;;;;;;;;;; } } BEGIN { my %rep = qw{ " " ' ' } ; sub xpath_attr_escape { my( $t ) = @_; $t =~ s/(['"])/ $rep{$1} /ge; $t; } $::xpc = XML::LibXML::XPathContext->new( ); sub XML::LibXML::Node::F { my( $self, $xpath, $context ) = @_; $::xpc->findnodes( $xpath, $context || $self ); } } sub XML::LibXML::Node::POS { $_[0]->F('preceding-sibling::*[name()="'.$_[0]->getName().'"]' )-> +size+1; } sub shorten { my $longy = join '', @_; $longy =~ s/[\r\n\t]+/ /gs; my $ll = length($longy); $ll > 71 and substr( $longy, 69, $ll ) = '...'; $longy; } sub XML::LibXML::Node::fullxpath { my $node = shift; my $ret = ''; $ret .= "\n# posy\n".fullxpath_posy($node)."\n"; $ret .= "\n# star".fullxpath_star($node)."\n\n"; $ret .= "# rats\n".fullxpath_rats($node)."\n\n"; if( $ret =~ /\s\@id\s=\s"/ ){ ## something to trim? $ret .= "# raid\n".fullxpath_rats_raid($node)."\n"; $ret .= "# chop\n".fullxpath_rats_cutoff($node)."\n"; } $ret; } ## *[...]/*[...] always sub fullxpath_star { my $node = shift; #~ my $ret = "\n/" . yatts( $node ); my $ret = "\n/" . yatts( $node, !!1 ); ## why was this my $parent = $node->getParentNode; while ($parent and $parent->getParentNode()) { $ret = yatts( $parent, !!1 ) . $ret; $ret = "\n/". $ret; $parent = $parent->getParentNode(); } $ret; } ## /every[1]/node[1]/position[1]/always[1] sub fullxpath_posy { my $node = shift; my $ret = ''; my $parent = $node; while ($parent and $parent->getParentNode()) { my $pos = $parent->POS(); $ret = '['.( $pos ).']' . $ret; $ret = '/'.$parent->getName () . $ret; $parent = $parent->getParentNode (); } $ret; } sub yatts { my( $node, $dopos ) = @_; my $name = xpath_attr_escape( $node->getName() ); my @ret = qq{name() = "$name"}; if( $dopos ){ push @ret, 'position() = '.$node->POS; } for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~"content"; my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } return join '', '*[ ', join( ' and ', @ret ) , ' ]'; } sub datts { my( $node ) = @_; my @ret = 'position() = '.$node->POS; for my $att ( $node->attributes() ){ my $name = $att->getName; next if $name =~"content"; my $value = xpath_attr_escape( $att->getValue ); push @ret, qq{\@$name = "$value"}; } return \@ret; } sub fullxpath_ratsy { my $node = shift; my @stuff ; my $parent = $node; while ($parent and $parent->getParentNode()) { my $atts = datts( $parent ) ; if( @$atts > 1 ){ ## more than position my $name = xpath_attr_escape( $parent->getName() ); push @stuff, join '', '*[ ', join( ' and ', qq{name() = "$name"}, @$atts , ), ' ]', ;;;;;;;; } else { push @stuff, $parent->getName() .'['. $parent->POS .']'; } $parent = $parent->getParentNode(); } return @stuff; } ## /position[1]/whennootheratts[3]/*[ position() = 1 and @other="atts" + ] sub fullxpath_rats { return join '/', '', map {"$_\n " } reverse &fullxpath_ratsy; } ## absolute with @id trumping other attrs sub fullxpath_rats_raid { return join '/', '', map {"$_\n " } reverse &fullxpath_rats_theid; } ## if @id remove all other attributes / id's are unique right? sub fullxpath_rats_theid { return map { m{ \sname\(\)\s=\s"([^"]+)" .+? \s(\@id\s=\s"[^"]+") }xi ? "$1\[$2]" : $_ } &fullxpath_ratsy; } ## relative from first @id , with @id trumping other attrs sub fullxpath_rats_cutoff { my @stuff = &fullxpath_rats_theid; use List::MoreUtils qw[ before_incl ]; my $stuff = @stuff; @stuff = before_incl { /\@id\s=\s"/i } @stuff; return join '/', ( $stuff > @stuff ? '/' : '' ), map {"$_\n " } reverse @stuff; } __END__

This xml

<?xml version="1.0" encoding="UTF-8"?> <sub-group-tree> <fake rocks="diamons"> <fake watch="ebolex" id="delicious"> <fake teeth="wood"> <niagra> peels </niagra> </fake> <fake ailment="vasomunchgestion"> <fake condition="Hungary" id="staxicemnt"> <fake disease="chroniclion"> <ip-address-ranges>192.168.0.1/24</ip-address-ranges> </fake> </fake> </fake> </fake> </fake> </sub-group-tree>

By default produces these paths to the same node

/sub-group-tree/fake/fake/fake[1]/niagra # posy /sub-group-tree[1]/fake[1]/fake[1]/fake[1]/niagra[1] # star /*[ name() = "sub-group-tree" and position() = 1 ] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = + "delicious" ] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /*[ name() = "niagra" and position() = 1 ] # rats /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id += "delicious" ] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /niagra[1] # raid /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /fake[@id = "delicious"] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /niagra[1] # chop //fake[@id = "delicious"] /*[ name() = "fake" and position() = 1 and @teeth = "wood" ] /niagra[1] # "content" peels ------ /sub-group-tree/fake/fake/fake[2]/fake/fake/ip-address-ranges # posy /sub-group-tree[1]/fake[1]/fake[1]/fake[2]/fake[1]/fake[1]/ip-address- +ranges[1] # star /*[ name() = "sub-group-tree" and position() = 1 ] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id = + "delicious" ] /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgestio +n" ] /*[ name() = "fake" and position() = 1 and @condition = "Hungary" and +@id = "staxicemnt" ] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /*[ name() = "ip-address-ranges" and position() = 1 ] # rats /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ name() = "fake" and position() = 1 and @watch = "ebolex" and @id += "delicious" ] /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgesti +on" ] /*[ name() = "fake" and position() = 1 and @condition = "Hungary" and + @id = "staxicemnt" ] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /ip-address-ranges[1] # raid /sub-group-tree[1] /*[ name() = "fake" and position() = 1 and @rocks = "diamons" ] /fake[@id = "delicious"] /*[ name() = "fake" and position() = 2 and @ailment = "vasomunchgesti +on" ] /fake[@id = "staxicemnt"] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /ip-address-ranges[1] # chop //fake[@id = "staxicemnt"] /*[ name() = "fake" and position() = 1 and @disease = "chroniclion" ] /ip-address-ranges[1] # "content" 192.168.0.1/24 ------