Well, after a long night between you and
Chmrr I learned something for sure.
I used HTML::TreeBuilder as suggested, and from the docs and some tinkering produced the following code. Its not at all more elegant than yours, nor arguably better, but whatever. Now I have no doubt that if I trawled the catacombs, or when blakem or a number of other people show up that theyll blow this all apart, but here goes anyway :-).
use warnings;
use strict;
use CGI qw(:standard :cgi-lib);
use LWP::Simple;
use HTML::TreeBuilder;
sub find_node {
my $node=shift; #well it might not be a font after all...
my $hashref=shift;
my $depth=shift;
my @content=$node->content_list;
return @content if (!ref($node) || uc($node->tag) ne "FONT");
my @tmp=$node->content_list;
# Build a fingerprint of the node. Numeric as a minor optimization
# -1 is text, # is the number of children the node has, so <br> ha
+s none
# (usually)
my $depthprint=join(":",map{ref $_ ? scalar $_->content_list : -1}
+ @tmp);
# This could be neater...
my $fingerprint=join("<>",map{ref $_ ? $_->tag : $_ } @tmp);
my ($node_id,$title,$date_node,$monk_node);
if ($depthprint=~/^(1:0:)?-1:1:-1$/ && # the finger print to match
$fingerprint=~/^(a<>br<>)? by <>a<> on [^<>]+$/i) { # the node
+ to match
#ok, this is almost definately a node header
if ($1) {
#print $1;
$node_id = CGI->new( $tmp[0]->attr('href')=~/\?(.*)/ )->pa
+ram( 'node_id' );
$title = ( $tmp[0]->content_list )[0];
($monk_node,$date_node)=@tmp[3,4];
} else {
#so no a<>br<> at the start, means this is
#probably the start of the nodes. go up and see if its
#a td, if it is then its first child should be an h3
#if its not, at any point bail, if it is, then the content
+s
#of the h3 is the thread title
#print "Start?";
my $parent=$node->parent;
return @content if $parent->tag ne "td";
my @pcont=$parent->content_list;
my $hdr=$pcont[0];
return @content if !$hdr || !ref($hdr) || $hdr->tag ne "h3
+";
$node_id=($hdr->content_list)[0];
$title=$node_id;
($monk_node,$date_node)=@tmp[1,2];
#print "Start!".$depth;
}
} else {
return @content;
}
my $home_id = CGI->new( $monk_node->attr('href')=~/\?(.*)/ )->p
+aram( 'node_id' );
my ($date) = ( $date_node=~/on (.*)/ );
my ($monkname) = $monk_node->content_list;
# Build the hash. this could be more elegant
# print "($depth) $date $title $monkname";
$hashref->{$monkname}->{$node_id}={date=>$date,title=>$title};
$hashref->{$monkname}->{Home}=$home_id;
return;
}
sub recurse {
my ($node,$hash,$depth)=@_;
# depth first search, real simple, everything is loaded in the
# hash.
#
# monkname->home->id
# |
# ->id->date->value #note id sorts by date doesnt it?
# |
# ->title->value
ref($_) && recurse($_,$hash,$depth+1)
foreach find_node($node,$hash,$depth);
};
sub get_names_in_thread {
my $id=shift;
my $html;
print "<p><STRONG>The posters from thread [id://$id]</STRONG><br>"
+;
if ($id) {
my $url ="http://perlmonks.org/index.pl?node_id=$id";
$html= get( $url) or die "can't get url $!";
} else { #for debugging
warn "Using DATA";
local $/;
$html=<DATA>;
}
my $tree = HTML::TreeBuilder->new();
$tree->parse($html);
my $hash={};
recurse($tree,$hash,0);
#everything here on is just formatting
my @sorted=sort {$a->[1] cmp $b->[1]}
map {my $key=lc($_);
#For { Nule }.. Keeps the weirdos on one branch :-)
$key=~s/[^[:alpha:]]/{/g;
["[id://$hash->{$_}->{Home}| $_ ]",$key]}
keys %$hash;
my %ltrs;
foreach (@sorted) {
my $ltr=substr($_->[1],0,1);
$ltrs{$ltr}=[] if !exists $ltrs{$ltr};
push @{$ltrs{$ltr}},$_->[0];
}
my $ret= "<ul>\n";
foreach (sort keys %ltrs) {
$ret.="\t<li>\n";
$ret.="\t\t".join(" | ",@{$ltrs{$_}})."\n";
$ret.="\t</li>\n";
}
return $ret."</ul>";
}
print get_names_in_thread(110166); #123859 is big too
__DATA__
which outputs:
The posters from thread Name Space
That was a lot of fun George_Sherston, I learned a lot. Thanks. (And BTW, I know I could have used more CGI tricks, but its been a long night, and I couldnt be bothered. Also some kind of recursion could be used to follow each reply looking for more replies, but, thats for another night :-).
Yves / DeMerphq
--
Have you registered your Name Space?
UPDATE: Fixed spelling of Chmrr