Re: Extract info from HTML

Well, after a long night between you and Chmrr I learned something for sure.

I used HTML::TreeBuilder as suggested, and from the docs and some tinkering produced the following code. Its not at all more elegant than yours, nor arguably better, but whatever. Now I have no doubt that if I trawled the catacombs, or when blakem or a number of other people show up that theyll blow this all apart, but here goes anyway :-).

use warnings;
use strict;
use CGI qw(:standard :cgi-lib);
use LWP::Simple;
use HTML::TreeBuilder;


sub find_node {
    my $node=shift;        #well it might not be a font after all...
    my $hashref=shift;
    my $depth=shift;
    
    my @content=$node->content_list;
    return @content if (!ref($node) || uc($node->tag) ne "FONT");
    
    my @tmp=$node->content_list;
    
    # Build a fingerprint of the node. Numeric as a minor optimization
    # -1 is text, # is the number of children the node has, so <br> ha
+s none 
    # (usually)
    my $depthprint=join(":",map{ref $_ ? scalar $_->content_list : -1}
+ @tmp);
    # This could be neater... 
    my $fingerprint=join("<>",map{ref $_ ? $_->tag : $_ } @tmp);
    
    my ($node_id,$title,$date_node,$monk_node);        
    if ($depthprint=~/^(1:0:)?-1:1:-1$/ && # the finger print to match
        $fingerprint=~/^(a<>br<>)? by <>a<> on [^<>]+$/i) { # the node
+ to match
        #ok, this is almost definately a node header    
        if ($1) {
            #print $1;
            $node_id = CGI->new( $tmp[0]->attr('href')=~/\?(.*)/ )->pa
+ram( 'node_id' );
            $title   = ( $tmp[0]->content_list )[0];        
            ($monk_node,$date_node)=@tmp[3,4];
        } else {
            #so no a<>br<> at the start, means this is
            #probably the start of the nodes. go up and see if its
            #a td, if it is then its first child should be an h3
            #if its not, at any point bail, if it is, then the content
+s
            #of the h3 is the thread title
            #print "Start?";
            my $parent=$node->parent;
            return @content if $parent->tag ne "td";
            my @pcont=$parent->content_list;
            my $hdr=$pcont[0];
            return @content if !$hdr || !ref($hdr) || $hdr->tag ne "h3
+";
            $node_id=($hdr->content_list)[0];
            $title=$node_id;
            ($monk_node,$date_node)=@tmp[1,2];
            #print "Start!".$depth;
        }
    } else {
        return @content;
    }

    my $home_id    = CGI->new( $monk_node->attr('href')=~/\?(.*)/ )->p
+aram( 'node_id' );
    my ($date)     = ( $date_node=~/on (.*)/ );
    my ($monkname) = $monk_node->content_list;
    
    # Build the hash. this could be more elegant
    # print "($depth) $date $title $monkname";
    $hashref->{$monkname}->{$node_id}={date=>$date,title=>$title};
    $hashref->{$monkname}->{Home}=$home_id;
    return;
}                             

sub recurse {
      my ($node,$hash,$depth)=@_;
     
      # depth first search, real simple, everything is loaded in the
      # hash.
      #
      # monkname->home->id
      #         |
      #         ->id->date->value #note id sorts by date doesnt it?
      #             |
      #             ->title->value
      
      ref($_) && recurse($_,$hash,$depth+1)
          foreach find_node($node,$hash,$depth);
};

sub get_names_in_thread {
    my $id=shift;
    
    my $html;
    print "<p><STRONG>The posters from thread [id://$id]</STRONG><br>"
+;
    if ($id) {
        my $url ="http://perlmonks.org/index.pl?node_id=$id";
        $html= get( $url) or die "can't get url $!";
    } else { #for debugging
        warn "Using DATA";
        local $/;
        $html=<DATA>;
    }
    
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($html);
    
    my $hash={};
    recurse($tree,$hash,0);
    
    #everything here on is just formatting
    my @sorted=sort {$a->[1] cmp $b->[1]} 
           map {my $key=lc($_); 
                   #For { Nule }.. Keeps the weirdos on one branch :-)
                $key=~s/[^[:alpha:]]/{/g; 
                ["[id://$hash->{$_}->{Home}| $_ ]",$key]} 
           keys %$hash;
           
    my %ltrs;
    foreach (@sorted) {
        my $ltr=substr($_->[1],0,1);
        $ltrs{$ltr}=[] if !exists $ltrs{$ltr};
        push @{$ltrs{$ltr}},$_->[0];
    }

    my $ret= "<ul>\n";
    foreach (sort keys %ltrs) {
        $ret.="\t<li>\n";
        $ret.="\t\t".join(" | ",@{$ltrs{$_}})."\n";
        $ret.="\t</li>\n";
    }
    return $ret."</ul>";
}

print get_names_in_thread(110166); #123859 is big too

__DATA__
[download]

which outputs:

The posters from thread Name Space

agent00013 | Aighearach | Arguile
basicdez | beretboy | blakem | BooK | BrentDax | brianarn | buckaduck
cadfael | chaoticset | cLive ;-) | Corion | CubicSpline
davis | davorg | demerphq | dga | dragonchild | dthacker
earthboundmisfit
George_Sherston | giulienk | gregor42 | Guildenstern
Hero Zzyzzx | herveus | Hofmator
ichimunki | idnopheq
jackdied | japhy | Jouke
kwoff
lo_tech
Maestro_007 | Masem | merphq | mexnix | mikeB | MrNobo1024 | mr_mischief
poqui | PotPieMan
riffraff | runrig
scain | seanbo | shotgunefx | sifukurt | stefan k | suaveant | synapse0 | s173451000
TheoPetersen | theorbtwo | tilly | toadi | tye
Veachian64
wine
ybiC
zakzebrowski | Zaxo | Zecho
{NULE}

That was a lot of fun George_Sherston, I learned a lot. Thanks. (And BTW, I know I could have used more CGI tricks, but its been a long night, and I couldnt be bothered. Also some kind of recursion could be used to follow each reply looking for more replies, but, thats for another night :-).

Yves / DeMerphq
--
Have you registered your Name Space?
UPDATE: Fixed spelling of Chmrr

Comment on Re: Extract info from HTML Download Code


laziness, impatience, and hubris
	PerlMonks