Beefy Boxes and Bandwidth Generously Provided by pair Networks
"be consistent"
 
PerlMonks  

htmltreexpather.pl - xpath helper, creates xpath search strings from html

by Anonymous Monk
on Oct 17, 2010 at 12:29 UTC ( [id://865792]=CUFP: print w/replies, xml ) Need Help??

htmltreexpather.pl
#!/usr/bin/perl --
#
# htmltreexpather.pl - xpath helper, creates xpath search strings from html
#
# Usage:
#   htmltreexpather.pl FILE_OR_HTML look_down-criteria...
#   htmltreexpather.pl select.html _tag option
#
# With no arguments the two built-in demos are run.  For every element
# matched by the look_down criteria three XPath flavours are printed:
#   addressx   absolute path using positional predicates only
#   addressxx  path anchored at the nearest element carrying an id
#   addressxX  like addressxx, but other attributes become predicates too
use strict;
use warnings;
use HTML::TreeBuilder;
use List::MoreUtils qw[ before_incl ];

Main(@ARGV);
exit(0);

# Entry point: dump XPaths for the given input, or run the demos when no
# arguments were supplied.
sub Main {
    if (@_) {
        PumpDump(@_);

        #~ PumpDump('<html>', qw/look_down criteria/ );
        #~ PumpDump('file', qw/look_down criteria/ );
    }
    else {
        print "Usage: $0 file _tag div\n\n";
        print "Demo1\n";
        Demo1();
        print "Demo3\n";    # was mislabelled "Demo1"
        Demo3();
    }
    return;
} ## end sub Main

# Demo: deeply nested divs/tables, several of them carrying id attributes.
sub Demo1 {
    my $html = <<'__HTML__';
<html>
<body>
<div></div>
<div id="wrapper">
  <div></div>
  <div id="outer">
    <div id="inner">
      <div></div>
      <div id="center">
        <div></div>
        <div id="main">
          <div></div>
          <div>
            <table id="wrappedcontent">
              <tbody class="shnitzel" bgcolor='red'>
                <tr>
                  <td>
                    <table>
                      <tbody>
                        <tr> <td><strong>key1</strong></td> <td>val1</td> </tr>
                        <tr> <td><strong>key2</strong></td> <td>val2</td> </tr>
                        <tr> <td><strong>key3</strong></td> <td>val3</td> </tr>
                        <tr> <td><strong>key4</strong></td> <td>val4</td> </tr>
                        <tr> <td><strong>key5</strong></td> <td>val5</td> </tr>
                        <tr> <td><strong>key6</strong></td> <td>val6</td> </tr>
                        <tr> <td><strong>key7</strong></td> <td>val7</td> </tr>
                        <tr> <td><strong>key8</strong></td> <td>val8</td> </tr>
                        <tr> <td><strong>key9</strong></td> <td>val9</td> </tr>
                        <tr> <td><strong>key10</strong></td> <td>val10</td> </tr>
                        <tr> <td><strong>key11</strong></td> <td>val11</td> </tr>
                      </tbody>
                    </table>
                  </td>
                </tr>
              </tbody>
            </table>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
</body>
</html>
__HTML__
    PumpDump( $html, _tag => qr/table|strong/i );
    return;
} ## end sub Demo1

# Demo: real-world tag soup without ids, so only positions/attributes help.
sub Demo3 {
    my $html = <<'__HTML__';
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><meta name="generator" content="DigiOnline GmbH - WebWeaver 3.4 CMS - http://www.webweaver.de"><title>educa.ch</title><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"><link rel="stylesheet" href="101.htm"><script src="102.htm"></script><script language="JavaScript"><!--
var did='d79376';
var root=new Array('d200','d205','d73137','d1566','d79376','d');
var usefocus = 1;
function check() {
  if ((self.focus) && (usefocus)) {
    self.focus();
  }
}
// --></script></head><body bgcolor="#FFFFFF" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0" onload="check();"><table cellspacing="0" cellpadding="0" border="0" width="100%"><tr><td width="15" class="popuphead"><img src="/0.gif" alt="" width="15" height="16"></td><td width="99%" class="popuphead">Adresse - Schulen in der Schweiz</td><td width="20" class="popuphead" valign="middle"><a href="#" title="Print" onclick="window.print(); return false;"><img src="../pics/print16x13.gif" alt="Drucken" width="16" height="13"></a></td><td width="20" class="popuphead" valign="middle"><a href="#" title="close" onclick="window.close(); return false;"><img src="../pics/close21x13.gif" alt="Schliessen" width="21" height="13"></a></td></tr>
<tr bgcolor="#B2B2B2"><td colspan="4"><img src="/0.gif" alt="" width="1" height="1"></td></tr></table><div class="leerzeile">&#160;</div><div class="leerzeile"><img src="/0.gif" alt="" width="15"height="8">Altes Schulhaus Ossingen </div><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width="15" height="8">Guntibachstrasse 10</div><div><img src="/0.gif" alt="" width="15" height="8"></div><div><img src="/0.gif" alt="" width="15" height="8">8475 &#160;Ossingen</div><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width="15" height="8"><a href="" target="_blank"></a></div><div><img src="/0.gif" alt="" width="15" height="8"><a href="mailto: sekretariat.psossingen@bluewin.ch">sekretariat.psossingen@bluewin.ch</a></div><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width="15" height="8">Tel:<img src="/0.gif" alt="" width="6" height="8">052 317 15 45 </div><div><img src="/0.gif" alt="" width="15" height="8">Fax:<img src="/0.gif" alt="" width="4" height="8">052 317 04 42 </div><div>&#160;</div></body></html>
__HTML__
    PumpDump( $html, _tag => qr/div/i );
    return;
} ## end sub Demo3

# Quote a string as an XPath 1.0 string literal.  XPath 1.0 literals have
# no escape mechanism, so pick whichever quote character the value does
# not contain, and fall back to concat() when it contains both kinds.
sub _xpath_literal {
    my ($value) = @_;
    return "'$value'"   if $value !~ /'/;
    return qq{"$value"} if $value !~ /"/;
    my @pieces = map {"'$_'"} split /'/, $value, -1;
    return 'concat(' . join( q{, "'", }, @pieces ) . ')';
} ## end sub _xpath_literal

# Positional predicate ("[N]") for an element among its same-tag siblings.
# XPath positions are 1-based, so N is the number of preceding same-tag
# siblings plus one.  Position 1 yields '' (no predicate), which keeps the
# short form the script has always printed for first siblings.
sub _position_predicate {
    my ($elem) = @_;
    my $tag = $elem->tag;

    # left() may return plain strings (text nodes); only objects have a tag.
    my $preceding = grep { ref $_ && $_->tag eq $tag } $elem->left;
    my $position  = $preceding + 1;
    return $position > 1 ? "[$position]" : '';
} ## end sub _position_predicate

# "[@id=...]" predicate for an element with a non-empty id, else ''.
# Uses defined/length so that an id of "0" is not mistaken for "no id".
sub _id_predicate {
    my ($elem) = @_;
    my $id = $elem->attr('id');
    return '' unless defined $id && length $id;
    return '[@id=' . _xpath_literal($id) . ']';
} ## end sub _id_predicate

# "[@a=... and @b=...]" predicate built from every external attribute other
# than id, or '' when there are none.  Names are sorted so the generated
# expression is the same from run to run.
sub _attr_predicate {
    my ($elem) = @_;
    my @names = grep { $_ ne 'id' } $elem->all_external_attr_names;
    return '' unless @names;
    @names = sort @names;
    my @tests
        = map { '@' . $_ . '=' . _xpath_literal( $elem->attr($_) ) } @names;
    return '[' . join( ' and ', @tests ) . ']';
} ## end sub _attr_predicate

# Turn a list of steps (element itself first, root last) into a path.  The
# list is cut after the first step that carries an @id predicate; when that
# drops ancestors the path is anchored with '//' instead of '/'.
sub _join_steps {
    my (@steps) = @_;
    my $total = @steps;

    # Match only a predicate that starts with exactly "@id=", so that
    # attributes such as @identifier are not mistaken for an id anchor.
    @steps = before_incl { /\A [^\[]* \[ \@id = /x } @steps;
    my $anchor = $total > @steps ? '/' : '';
    return join( '/', $anchor, reverse @steps );    # reverse: start at the top
} ## end sub _join_steps

# $elem->addressx
# Absolute XPath from the root down to the element, using positional
# predicates only, e.g. /html/body/div[3].
sub HTML::Element::addressx {
    my ($self) = @_;
    my @steps = map {
        my $node = $_;
        $node->tag . _position_predicate($node);
    } $self, $self->lineage;    # self and all ancestors
    return join( '/', '', reverse @steps );
} ## end sub HTML::Element::addressx

# $elem->addressxx
# XPath anchored at the nearest element (self or ancestor) that has an id,
# e.g. //table[@id='wrappedcontent']/tbody/tr/td; absolute when none has.
sub HTML::Element::addressxx {
    my ($self) = @_;
    my @steps = map {
        my $node = $_;
        $node->tag . ( _id_predicate($node) || _position_predicate($node) );
    } $self, $self->lineage;
    return _join_steps(@steps);
} ## end sub HTML::Element::addressxx

# $elem->addressxX
# Like addressxx, but an element without an id is described by all of its
# other attributes when it has any, and by position only as a last resort.
sub HTML::Element::addressxX {
    my ($self) = @_;
    my @steps = map {
        my $node = $_;
        $node->tag
            . (    _id_predicate($node)
                || _attr_predicate($node)
                || _position_predicate($node) );
    } $self, $self->lineage;
    return _join_steps(@steps);
} ## end sub HTML::Element::addressxX

# PumpDump( $html_or_file, @look_down_criteria )
# Parse the first argument (treated as markup when it contains '<', else as
# a file name), then print the text and the generated XPaths of every
# element matched by HTML::Element::look_down(@look_down_criteria).
# Dies when the named file cannot be opened.
sub PumpDump {
    my ( $html, @lookdown ) = @_;
    my $tree = HTML::TreeBuilder->new();
    if ( $html =~ /</ ) {
        $tree->parse($html);
    }
    else {
        defined $tree->parse_file($html)
            or die "Cannot open '$html': $!\n";
    }
    $tree->eof;
    for my $td ( $tree->look_down(@lookdown) ) {
        my $text = $td->as_trimmed_text;
        next if $text =~ /^\p{Zs}*$/;    ## ysth, nbsp isn't \s
        print $td, "\t", $td->address, "\n";
        print $text, "\n";
        print $td->addressx,  "\n";
        print $td->addressxx, "\n";
        print $td->addressxX, "\n";
        print '-' x 66, "\n";
    } ## end for my $td ( $tree->look_down...)
    $tree->delete;
    undef $tree;
    print '#' x 66, "\n\n";
    return;
} ## end sub PumpDump
__END__
$ perl htmltreexpather.pl select.html _tag option HTML::Element=HASH(0xb139ec) 0.1.1.0.0 Chose Some aaa /html/body/form/select/option /html/body/form/select/option /html/body[@bgcolor='red']/form[@action='/foo.cgi' and @name='queryfoo +']/select[@name='singlelist']/option[@value='aaa'] ------------------------------------------------------------------ ##################################################################
$ perl htmltreexpather.pl Usage: htmltreexpather.pl file _tag div Demo1 HTML::Element=HASH(0xb163f4) 0.1.1.1.0.1.1.1.0 key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9va +l9key10val10key11val11 /html/body/div/div/div/div/div/div/table //table[@id='wrappedcontent'] //table[@id='wrappedcontent'] ------------------------------------------------------------------ HTML::Element=HASH(0xb16574) 0.1.1.1.0.1.1.1.0.0.0.0.0 key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9va +l9key10val10key11val11 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table //table[@id='wrappedcontent']/tbody/tr/td/table //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table ------------------------------------------------------------------ HTML::Element=HASH(0xb166c4) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.0.0.0 key1 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td +/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb16874) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.1.0.0 key2 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td +/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6b9ac) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.2.0.0 key3 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[2] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[2]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[2]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bb5c) 
0.1.1.1.0.1.1.1.0.0.0.0.0.0.3.0.0 key4 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[3] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[3]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[3]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bd0c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.4.0.0 key5 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[4] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[4]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[4]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bebc) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.5.0.0 key6 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[5] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[5]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[5]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c06c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.6.0.0 key7 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[6] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[6]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[6]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c21c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.7.0.0 key8 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[7] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[7]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[7]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c3cc) 
0.1.1.1.0.1.1.1.0.0.0.0.0.0.8.0.0 key9 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[8] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[8]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[8]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c57c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.9.0.0 key10 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[9] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[9]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[9]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c72c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.10.0.0 key11 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[10 +]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[10]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[10]/td/strong ------------------------------------------------------------------ ################################################################## Demo1 HTML::Element=HASH(0xb6c44c) 0.1.2 Altes Schulhaus Ossingen /html/body/div /html/body/div /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[@class='leerzeile'] ------------------------------------------------------------------ HTML::Element=HASH(0xb6c13c) 0.1.4 Guntibachstrasse 10 /html/body/div[3] /html/body/div[3] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[3] ------------------------------------------------------------------ HTML::Element=HASH(0xb6c2cc) 0.1.6 8475 áOssingen /html/body/div[5] /html/body/div[5] /html/body[@topmargin='0' and 
@marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[5] ------------------------------------------------------------------ HTML::Element=HASH(0xb6bdec) 0.1.9 sekretariat.psossingen@bluewin.ch /html/body/div[8] /html/body/div[8] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[8] ------------------------------------------------------------------ HTML::Element=HASH(0xb6da44) 0.1.11 Tel:052 317 15 45 /html/body/div[10] /html/body/div[10] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[10] ------------------------------------------------------------------ HTML::Element=HASH(0xb6bd1c) 0.1.12 Fax:052 317 04 42 /html/body/div[11] /html/body/div[11] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[11] ------------------------------------------------------------------ ##################################################################

Replies are listed 'Best First'.
Re: htmltreexpather.pl - xpath helper, creates xpath search strings from html (off-by-one error (OBOE) )
by Anonymous Monk on Mar 02, 2011 at 09:35 UTC

    off-by-one error (OBOE)

    Dag nabbit, there was an off-by-one error (OBOE); apparently, XPath starts counting at 1 (confirmed with xpather and xpath-checker), not 0, so change all occurrences of
    my $count = 0;
    to
    my $count = 1;
Re: htmltreexpather.pl - xpather.pl -- creates xpath search strings from html/xml using XML::LibXML
by Anonymous Monk on Jun 27, 2013 at 03:57 UTC

      This should teach you 80% of everything xpath you need to know :)

      And here is an updated version of star: it's now namespace-aware and only uses local-name(), it skips adding a bunch of xmlns attributes, and its output is more indented. Example:

      # star /*[ local-name() = "sub-group-tree" and position() = 1 ] /*[ local-name() = "fake" and position() = 1 and @rocks = "diamons" ] /*[ local-name() = "fake" and position() = 1 and @watch = "ebolex" and @id = "delicious" ] /*[ local-name() = "fake" and position() = 1 and @teeth = "wood" ] /*[ local-name() = "niagra" and contains(string(), " peels ") ]

      The code

Re: htmltreexpather.pl - xpath helper, creates xpath search strings from html ($VERSION = 20120112 )
by Anonymous Monk on Jan 12, 2012 at 13:00 UTC

    Here is $VERSION = 20120112, which incorporates the OBOE fix and properly escapes quotes/apostrophes in attribute values

    Demo2 HTML::Element=HASH(0xb33a1c) 0.1.0 don't "quote" it's nonsense /html/body/div //div[@id='yo'] //div[@id='yo'] ------------------------------------------------------------------ HTML::Element=HASH(0xb33bdc) 0.1.0.0.2.2 nonsense /html/body/div/div/div[2]/div[2] //div[@id='yo']/div/div[2]/div[2] //div[@id='yo']/div[@q='don&apos;t']/div[@q='&quot;quote&quot;']/div[@ +q='it&apos;s'] ------------------------------------------------------------------ ##################################################################
Re: htmltreexpather.pl - xpath helper, creates xpath search strings from html
by Perlbeginner1 (Scribe) on Oct 17, 2010 at 17:17 UTC
    Does this help me in general
    - in other words. Does this code help me in general to find the xpaths?


    I look forward to hearing from you!

      For general use (depending on what you're trying to do) this is probably the best way to do xpaths for HTML/CSS: HTML::Selector::XPath, though it's not the same as the OP's code, it takes CSS selectors and turns them into xpath expressions.

      Does this help me in general - in other words. Does this code help me in general to find the xpaths?

      I don't know. It has helped me, and it's almost as useful as firebug/xpather

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: CUFP [id://865792]
Approved by Corion
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others taking refuge in the Monastery: (4)
As of 2024-04-25 23:47 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found