comment on

Well, after a long night between you and Chmrr I learned something for sure.

I used HTML::TreeBuilder as suggested, and from the docs and some tinkering produced the following code. It's not at all more elegant than yours, nor arguably better, but whatever. Now I have no doubt that if I trawled the catacombs, or when blakem or a number of other people show up, they'll blow this all apart, but here goes anyway :-).

use warnings;
use strict;
use CGI qw(:standard :cgi-lib);
use LWP::Simple;
use HTML::TreeBuilder;


# _node_id_from_href($href)
#
# Pull the 'node_id' query parameter out of a link target such as
# "index.pl?node_id=1234". Returns nothing (undef in scalar context) when
# the href is missing or has no query string, instead of handing CGI an
# empty argument list.
sub _node_id_from_href {
    my ($href) = @_;
    return if !defined $href;
    my ($query) = $href =~ /\?(.*)/ or return;
    return scalar CGI->new($query)->param('node_id');
}

# find_node($node, $hashref, $depth)
#
# Examine one element of the parsed tree.
#
#   * If $node is not an element object, there is nothing to walk: an empty
#     list is returned.
#   * If $node is not a <font> element whose children look like a node
#     header ("[<a>title</a><br>] by <a>monk</a> on date"), its children are
#     returned so the caller can keep descending.
#   * If it is a node header, the poster is recorded in $hashref and an
#     empty list is returned (no need to descend further).
#
# Records written to $hashref:
#   $hashref->{$monkname}{$node_id} = { date => ..., title => ... }
#   $hashref->{$monkname}{Home}     = node_id of the monk's home node
#
# $depth is accepted for debugging purposes only and is not otherwise used.
sub find_node {
    my ($node, $hashref, $depth) = @_;

    # Text segments are plain strings, not objects; calling a method on
    # them would die, so bail out before touching the node.
    return if !ref $node;

    my @content = $node->content_list;
    return @content if uc($node->tag) ne "FONT";

    # Build a fingerprint of the node. Numeric as a minor optimization:
    # -1 is text, otherwise the number of children the child element has,
    # so <br> has none (usually).
    my $depthprint =
        join ":", map { ref $_ ? scalar $_->content_list : -1 } @content;
    my $fingerprint =
        join "<>", map { ref $_ ? $_->tag : $_ } @content;

    # Both fingerprints must match for this to be a node header.
    return @content if $depthprint !~ /^(?:1:0:)?-1:1:-1$/;
    return @content
        if $fingerprint !~ /^(a<>br<>)? by <>a<> on [^<>]+$/i;

    # $1 is set only when the optional leading "a<>br<>" (title link plus
    # line break) was present in the fingerprint.
    my $has_title_link = defined $1;

    my ($node_id, $title, $monk_node, $date_node);
    if ($has_title_link) {
        # A reply: the first child is the link to the reply itself.
        $node_id = _node_id_from_href($content[0]->attr('href'));
        ($title) = $content[0]->content_list;
        ($monk_node, $date_node) = @content[3, 4];
    } else {
        # No a<>br<> at the start means this is probably the start of the
        # nodes. Go up and see if the parent is a td; if it is, its first
        # child should be an h3 whose content is the thread title. If any
        # of that does not hold, bail out and let the caller descend.
        my $parent = $node->parent;
        return @content if !$parent || $parent->tag ne "td";

        my ($hdr) = $parent->content_list;
        return @content if !$hdr || !ref($hdr) || $hdr->tag ne "h3";

        # The root node has no link to itself here, so the thread title
        # doubles as its identifier.
        ($title) = $hdr->content_list;
        $node_id = $title;
        ($monk_node, $date_node) = @content[1, 2];
    }

    my $home_id    = _node_id_from_href($monk_node->attr('href'));
    my ($date)     = $date_node =~ /on (.*)/;
    my ($monkname) = $monk_node->content_list;

    # Build the hash. This could be more elegant.
    $hashref->{$monkname}->{$node_id} = { date => $date, title => $title };
    $hashref->{$monkname}->{Home}     = $home_id;
    return;
}

# recurse($node, $hash, $depth)
#
# Depth-first walk of the parsed tree. find_node() either records a poster
# in $hash and returns nothing, or returns the children that still need to
# be visited; every child that is an element (a reference) is walked one
# level deeper.
#
# Layout of $hash once the walk is finished:
#
#   monkname -> Home    -> id of the monk's home node
#            -> node id -> date  -> value
#                       -> title -> value
sub recurse {
    my ($node, $hash, $depth) = @_;

    for my $child (find_node($node, $hash, $depth)) {
        next unless ref $child;    # text segments have nothing below them
        recurse($child, $hash, $depth + 1);
    }
}

# get_names_in_thread($id)
#
# Fetch the PerlMonks thread with node id $id, collect everybody who posted
# in it, and return an HTML <ul> with one <li> per initial letter, each
# holding " | "-separated [id://home|name] links.
#
# A heading line is printed to STDOUT as a side effect before the list is
# built. When $id is false the page is read from the DATA handle instead of
# the network (debugging aid).
#
# Dies if the page cannot be fetched.
sub get_names_in_thread {
    my $id = shift;

    # Avoid an "uninitialized" warning in the debugging (no id) case.
    my $shown_id = defined $id ? $id : '';
    print "<p><STRONG>The posters from thread [id://$shown_id]</STRONG><br>";

    my $html;
    if ($id) {
        my $url = "http://perlmonks.org/index.pl?node_id=$id";
        $html = get($url);
        # get() signals failure with undef and does not set $!, so test
        # definedness and report the URL instead.
        die "can't get url $url" if !defined $html;
    } else { #for debugging
        warn "Using DATA";
        local $/;
        $html = <DATA>;
    }

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($html);
    $tree->eof;       # required after parse(): finalizes the tree

    my $hash = {};
    recurse($tree, $hash, 0);
    $tree->delete;    # release the tree explicitly once we are done with it

    # Everything from here on is just formatting.
    #
    # Each entry is [ link markup, sort key ]. The sort key is the
    # lower-cased name with every non-letter replaced by '{', which sorts
    # after 'z' -- so names like "{NULE}" end up together on a final branch.
    my @sorted =
        sort { $a->[1] cmp $b->[1] or $a->[0] cmp $b->[0] }
        map {
            my $key = lc($_);
            $key =~ s/[^[:alpha:]]/{/g;
            [ "[id://$hash->{$_}->{Home}| $_ ]", $key ]
        }
        keys %$hash;

    # Group the links by the first character of their sort key.
    my %ltrs;
    foreach my $entry (@sorted) {
        my $ltr = substr($entry->[1], 0, 1);
        push @{ $ltrs{$ltr} }, $entry->[0];
    }

    my $ret = "<ul>\n";
    foreach my $ltr (sort keys %ltrs) {
        $ret .= "\t<li>\n";
        $ret .= "\t\t" . join(" | ", @{ $ltrs{$ltr} }) . "\n";
        $ret .= "\t</li>\n";
    }
    return $ret . "</ul>";
}

# Script entry point: build and print the poster list for thread 110166.
print get_names_in_thread(110166); #123859 is big too

__DATA__
[download]

which outputs:

The posters from thread Name Space

agent00013 | Aighearach | Arguile
basicdez | beretboy | blakem | BooK | BrentDax | brianarn | buckaduck
cadfael | chaoticset | cLive ;-) | Corion | CubicSpline
davis | davorg | demerphq | dga | dragonchild | dthacker
earthboundmisfit
George_Sherston | giulienk | gregor42 | Guildenstern
Hero Zzyzzx | herveus | Hofmator
ichimunki | idnopheq
jackdied | japhy | Jouke
kwoff
lo_tech
Maestro_007 | Masem | merphq | mexnix | mikeB | MrNobo1024 | mr_mischief
poqui | PotPieMan
riffraff | runrig
scain | seanbo | shotgunefx | sifukurt | stefan k | suaveant | synapse0 | s173451000
TheoPetersen | theorbtwo | tilly | toadi | tye
Veachian64
wine
ybiC
zakzebrowski | Zaxo | Zecho
{NULE}

That was a lot of fun George_Sherston, I learned a lot. Thanks. (And BTW, I know I could have used more CGI tricks, but it's been a long night, and I couldn't be bothered. Also some kind of recursion could be used to follow each reply looking for more replies, but that's for another night :-).

Yves / DeMerphq
--
Have you registered your Name Space?
UPDATE: Fixed spelling of Chmrr

In reply to Re: Extract info from HTML by demerphq
in thread Extract info from HTML by George_Sherston

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


good chemistry is complicated, and a little bit messy -LW
	PerlMonks