http://qs321.pair.com?node_id=865792

htmltreexpather.pl
#!/usr/bin/perl --
# htmltreexpather.pl
#
# For every element of an HTML document that matches a set of
# HTML::Element look_down() criteria, print the element's text and three
# XPath-style addresses of increasing specificity (see addressx, addressxx
# and addressxX below).
#
# Usage: htmltreexpather.pl FILE-OR-HTML [look_down criteria ...]
# Run without arguments to see the two built-in demos.
use strict;
use warnings;
use HTML::TreeBuilder;
use List::MoreUtils qw[ before_incl ];
use Scalar::Util qw[ blessed ];

Main(@ARGV);
exit(0);

# Entry point: with arguments, dump the addresses for the given document and
# criteria; without arguments, print a usage line and run the demos.
sub Main {
    if (@_) {
        PumpDump(@_);

        #~ PumpDump('<html>', qw/look_down criteria/ );
        #~ PumpDump('file', qw/look_down criteria/ );
    }
    else {
        print "Usage: $0 file _tag div\n\n";
        print "Demo1\n";
        Demo1();
        print "Demo3\n";    # label now matches the demo that is run
        Demo3();
    }
} ## end sub Main

# Demo: deeply nested divs and tables, one of which carries an id.
sub Demo1 {
    my $html = <<'__HTML__';
<html>
<body>
<div></div>
<div id="wrapper">
 <div></div>
 <div id="outer">
  <div id="inner">
   <div></div>
   <div id="center">
    <div></div>
    <div id="main">
     <div></div>
     <div>
      <table id="wrappedcontent">
       <tbody class="shnitzel" bgcolor='red'>
        <tr>
         <td>
          <table>
           <tbody>
            <tr> <td><strong>key1</strong></td> <td>val1</td> </tr>
            <tr> <td><strong>key2</strong></td> <td>val2</td> </tr>
            <tr> <td><strong>key3</strong></td> <td>val3</td> </tr>
            <tr> <td><strong>key4</strong></td> <td>val4</td> </tr>
            <tr> <td><strong>key5</strong></td> <td>val5</td> </tr>
            <tr> <td><strong>key6</strong></td> <td>val6</td> </tr>
            <tr> <td><strong>key7</strong></td> <td>val7</td> </tr>
            <tr> <td><strong>key8</strong></td> <td>val8</td> </tr>
            <tr> <td><strong>key9</strong></td> <td>val9</td> </tr>
            <tr> <td><strong>key10</strong></td> <td>val10</td> </tr>
            <tr> <td><strong>key11</strong></td> <td>val11</td> </tr>
           </tbody>
          </table>
         </td>
        </tr>
       </tbody>
      </table>
     </div>
    </div>
   </div>
  </div>
 </div>
</div>
</body>
</html>
__HTML__
    PumpDump( $html, _tag => qr/table|strong/i );
} ## end sub Demo1

# Demo: a real-world page with many sibling divs and no ids.
sub Demo3 {
    my $html = <<'__HTML__';
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><meta name="generator" content="DigiOnline GmbH - WebWeaver 3.4 CMS - http://www.webweaver.de"><title>educa.ch</title><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"><link rel="stylesheet" href="101.htm"><script src="102.htm"></script><script language="JavaScript"><!--
var did='d79376';
var root=new Array('d200','d205','d73137','d1566','d79376','d');
var usefocus = 1;
function check() {
if ((self.focus) && (usefocus)) {
self.focus();
}
}
// --></script></head><body bgcolor="#FFFFFF" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0" onload="check();"><table cellspacing="0" cellpadding="0" border="0" width="100%"><tr><td width="15" class="popuphead"><img src="/0.gif" alt="" width="15" height="16"></td><td width="99%" class="popuphead">Adresse - Schulen in der Schweiz</td><td width="20" class="popuphead" valign="middle"><a href="#" title="Print" onclick="window.print(); return false;"><img src="../pics/print16x13.gif" alt="Drucken" width="16" height="13"></a></td><td width="20" class="popuphead" valign="middle"><a href="#" title="close" onclick="window.close(); return false;"><img src="../pics/close21x13.gif" alt="Schliessen" width="21" height="13"></a></td></tr>
<tr bgcolor="#B2B2B2"><td colspan="4"><img src="/0.gif" alt="" width="1" height="1"></td></tr></table><div class="leerzeile">&#160;</div><div class="leerzeile"><img src="/0.gif" alt="" width="15"height="8">Altes Schulhaus Ossingen </div><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width="15" height="8">Guntibachstrasse 10</div><div><img src="/0.gif" alt="" width="15" height="8"></div><div><img src="/0.gif" alt="" width="15" height="8">8475 &#160;Ossingen</div><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width="15" height="8"><a href="" target="_blank"></a></div><div><img src="/0.gif" alt="" width="15" height="8"><a href="mailto: sekretariat.psossingen@bluewin.ch">sekretariat.psossingen@bluewin.ch</a></div><div class="leerzeile">&#160;</div><div><img src="/0.gif" alt="" width="15" height="8">Tel:<img src="/0.gif" alt="" width="6" height="8">052 317 15 45 </div><div><img src="/0.gif" alt="" width="15" height="8">Fax:<img src="/0.gif" alt="" width="4" height="8">052 317 04 42 </div><div>&#160;</div></body></html>
__HTML__
    PumpDump( $html, _tag => qr/div/i );
} ## end sub Demo3

# Render $value as an XPath 1.0 string literal.
# XPath 1.0 has no escape sequences, so the quote style is chosen to avoid
# the quotes in the value; a value holding both kinds is built with concat().
sub _xpath_literal {
    my ($value) = @_;
    $value = '' unless defined $value;
    return "'$value'"   if $value !~ /'/;
    return qq{"$value"} if $value !~ /"/;
    my @pieces = map {"'$_'"} split /'/, $value, -1;
    return 'concat(' . join( q{, "'", }, @pieces ) . ')';
}

# Return the positional predicate ("[N]", 1-based as XPath requires) that
# singles $element out among siblings with the same tag, or '' when it has
# no same-tag sibling and the bare tag name is already unambiguous.
sub _position_predicate {
    my ($element) = @_;
    my $tag = $element->tag;

    # Sibling lists mix element objects with plain text strings;
    # only objects have a tag to compare.
    my $same_tag = sub { blessed( $_[0] ) && $_[0]->tag eq $tag };
    my $before   = grep { $same_tag->($_) } $element->left;
    my $after    = grep { $same_tag->($_) } $element->right;

    return ( $before || $after ) ? '[' . ( $before + 1 ) . ']' : '';
}

# Join location steps into a path.
# Takes a list of [ $step_text, $has_id ] pairs ordered from the element up
# to the root. The path is cut at the nearest step that has an id and then
# starts with '//' (found anywhere); otherwise it is absolute from the root.
sub _join_from_id_anchor {
    my (@steps) = @_;
    my $total = @steps;
    @steps = before_incl { $_->[1] } @steps;
    my $lead = $total > @steps ? '/' : '';
    return join( '/', $lead, reverse map { $_->[0] } @steps );
}

# $element->addressx
# Absolute path from the root using tag names and positional predicates
# only, e.g. /html/body/div[3].
sub HTML::Element::addressx {
    my ($self) = @_;
    my @steps = map { $_->tag . _position_predicate($_) } $self, $self->lineage;
    return join( '/', '', reverse @steps );
} ## end sub HTML::Element::addressx

# $element->addressxx
# Like addressx, but an element with an id is written as tag[@id=...] and
# the path is anchored at the nearest such element, e.g.
# //table[@id='wrappedcontent']/tbody/tr/td/table.
sub HTML::Element::addressxx {
    my ($self) = @_;
    my @steps = map {
        my $id = $_->attr('id');
        defined $id && length $id
          ? [ $_->tag . '[@id=' . _xpath_literal($id) . ']', 1 ]
          : [ $_->tag . _position_predicate($_), 0 ];
    } $self, $self->lineage;
    return _join_from_id_anchor(@steps);
} ## end sub HTML::Element::addressxx

# $element->addressxX
# Like addressxx, but an element without an id that has other attributes is
# qualified by all of them (sorted by name so the output is stable) instead
# of by position, e.g. tbody[@bgcolor='red' and @class='shnitzel'].
sub HTML::Element::addressxX {
    my ($self) = @_;
    my @steps = map {
        my $element = $_;
        my $id      = $element->id;
        my @names   = grep { $_ ne 'id' } $element->all_external_attr_names;
        @names = sort @names;

        my $has_id = defined $id && length $id ? 1 : 0;
        my $predicate;
        if ($has_id) {
            $predicate = '[@id=' . _xpath_literal($id) . ']';
        }
        elsif (@names) {
            $predicate = '[' . join(
                ' and ',
                map {
                    sprintf '@%s=%s', $_,
                      _xpath_literal( $element->attr($_) )
                } @names
            ) . ']';
        }
        else {
            $predicate = _position_predicate($element);
        }
        [ $element->tag . $predicate, $has_id ];
    } $self, $self->lineage;
    return _join_from_id_anchor(@steps);
} ## end sub HTML::Element::addressxX

# PumpDump( $html, @lookdown )
# Parse $html and, for each element matching the look_down() criteria in
# @lookdown that has visible text, print the object, its address(), its
# trimmed text and the three XPath variants. $html is treated as markup
# when it contains a '<' and as a file name otherwise.
# Dies if the named file cannot be opened.
sub PumpDump {
    my ( $html, @lookdown ) = @_;
    my $tree = HTML::TreeBuilder->new();
    if ( $html =~ /</ ) {
        $tree->parse($html);
    }
    else {
        defined $tree->parse_file($html)
          or die "Cannot open HTML file '$html': $!\n";
    }
    $tree->eof;

    for my $td ( $tree->look_down(@lookdown) ) {
        my $text = $td->as_trimmed_text;
        next if $text =~ /^\p{Zs}*$/;    ## ysth, nbsp isn't \s
        print $td, "\t", $td->address, "\n";
        print $text,          "\n";
        print $td->addressx,  "\n";
        print $td->addressxx, "\n";
        print $td->addressxX, "\n";
        print '-' x 66, "\n";
    } ## end for my $td ( $tree->look_down...)

    $tree->delete;
    undef $tree;
    print '#' x 66, "\n\n";
} ## end sub PumpDump

# NOTE: the transcript after __END__ was captured from an earlier revision;
# its positional indexes predate the 1-based numbering produced above.
__END__
$ perl htmltreexpather.pl select.html _tag option HTML::Element=HASH(0xb139ec) 0.1.1.0.0 Chose Some aaa /html/body/form/select/option /html/body/form/select/option /html/body[@bgcolor='red']/form[@action='/foo.cgi' and @name='queryfoo +']/select[@name='singlelist']/option[@value='aaa'] ------------------------------------------------------------------ ##################################################################
$ perl htmltreexpather.pl Usage: htmltreexpather.pl file _tag div Demo1 HTML::Element=HASH(0xb163f4) 0.1.1.1.0.1.1.1.0 key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9va +l9key10val10key11val11 /html/body/div/div/div/div/div/div/table //table[@id='wrappedcontent'] //table[@id='wrappedcontent'] ------------------------------------------------------------------ HTML::Element=HASH(0xb16574) 0.1.1.1.0.1.1.1.0.0.0.0.0 key1val1key2val2key3val3key4val4key5val5key6val6key7val7key8val8key9va +l9key10val10key11val11 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table //table[@id='wrappedcontent']/tbody/tr/td/table //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table ------------------------------------------------------------------ HTML::Element=HASH(0xb166c4) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.0.0.0 key1 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td +/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb16874) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.1.0.0 key2 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr/td +/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6b9ac) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.2.0.0 key3 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[2] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[2]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[2]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bb5c) 
0.1.1.1.0.1.1.1.0.0.0.0.0.0.3.0.0 key4 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[3] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[3]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[3]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bd0c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.4.0.0 key5 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[4] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[4]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[4]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6bebc) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.5.0.0 key6 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[5] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[5]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[5]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c06c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.6.0.0 key7 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[6] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[6]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[6]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c21c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.7.0.0 key8 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[7] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[7]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[7]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c3cc) 
0.1.1.1.0.1.1.1.0.0.0.0.0.0.8.0.0 key9 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[8] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[8]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[8]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c57c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.9.0.0 key10 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[9] +/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[9]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[9]/td/strong ------------------------------------------------------------------ HTML::Element=HASH(0xb6c72c) 0.1.1.1.0.1.1.1.0.0.0.0.0.0.10.0.0 key11 /html/body/div/div/div/div/div/div/table/tbody/tr/td/table/tbody/tr[10 +]/td/strong //table[@id='wrappedcontent']/tbody/tr/td/table/tbody/tr[10]/td/strong //table[@id='wrappedcontent']/tbody[@bgcolor='red' and @class='shnitze +l']/tr/td/table/tbody/tr[10]/td/strong ------------------------------------------------------------------ ################################################################## Demo1 HTML::Element=HASH(0xb6c44c) 0.1.2 Altes Schulhaus Ossingen /html/body/div /html/body/div /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[@class='leerzeile'] ------------------------------------------------------------------ HTML::Element=HASH(0xb6c13c) 0.1.4 Guntibachstrasse 10 /html/body/div[3] /html/body/div[3] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[3] ------------------------------------------------------------------ HTML::Element=HASH(0xb6c2cc) 0.1.6 8475 áOssingen /html/body/div[5] /html/body/div[5] /html/body[@topmargin='0' and 
@marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[5] ------------------------------------------------------------------ HTML::Element=HASH(0xb6bdec) 0.1.9 sekretariat.psossingen@bluewin.ch /html/body/div[8] /html/body/div[8] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[8] ------------------------------------------------------------------ HTML::Element=HASH(0xb6da44) 0.1.11 Tel:052 317 15 45 /html/body/div[10] /html/body/div[10] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[10] ------------------------------------------------------------------ HTML::Element=HASH(0xb6bd1c) 0.1.12 Fax:052 317 04 42 /html/body/div[11] /html/body/div[11] /html/body[@topmargin='0' and @marginwidth='0' and @leftmargin='0' and + @bgcolor='#FFFFFF' and @onload='check();' and @marginheight='0']/div +[11] ------------------------------------------------------------------ ##################################################################