I decided to use your post as an opportunity to teach myself HTML::Parser.
Since this was my first time using the module, it took ~2 hours to write, with the last 10% of the code
taking 90% of the time...
( FYI: that was in getting the <a> tags within the <td> tags to parse correctly )
So, here ya go -- using HTML::Parser and LWP::UserAgent :
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use HTML::Parser;
# File-level state.  $tmp, $i and @stats are shared with the parser
# callbacks defined below:
#   $tmp   - text accumulated for the table row currently being parsed
#   $i     - number of <td> start tags seen so far in the current row
#   @stats - finished rows, one newline-terminated string per </tr>
# $tr is declared but not used anywhere in this script.
my ( $href, $ua, $req, $resp,
     $tmp, $i, $p, $tr, @stats );

$href = "http://setiathome.ssl.berkeley.edu/stats/country_7.html";
$ua   = LWP::UserAgent->new();

# Direct method call instead of the indirect-object form
# "new HTTP::Request(...)", which Perl can mis-parse.
$req  = HTTP::Request->new( 'GET', $href );
$resp = $ua->request($req);

# Stop here on a failed fetch rather than parsing an error page
# (or an empty body) as if it were the stats table.
die "GET $href failed: ", $resp->status_line, "\n"
    unless $resp->is_success;
# Text handler that start() installs each time a <td> opens.
#
# Arguments (argspec "self, dtext"):
#   1. the HTML::Parser object
#   2. the decoded text of the text event
#
# Only the first two cells of a row are recorded (the file-level counter
# $i is 1 or 2); from the third cell onward the call returns immediately.
# Cell 1 is appended to $tmp as-is, cell 2 is appended after a ":" and the
# result is chomped.
sub get_table_text {
    return if $i >= 3;

    my ( $parser, $cell_text ) = @_;

    # Swap in a handler that ignores text, so later text events are dropped
    # until a start tag installs a collecting handler again.
    $parser->handler( text => sub { return if shift eq "" }, "dtext" );

    # Strip a leading "<digits>) " prefix when the text has one.
    $cell_text =~ s/^\d+\)\s(.*)$/$1/;

    if ( $i == 1 ) {
        $tmp .= $cell_text;
    }
    elsif ( $i == 2 ) {
        $tmp .= ":$cell_text";
        chomp $tmp;
    }
}
# Text handler that start() installs each time an <a> opens.
#
# Arguments (argspec "self, dtext"):
#   1. the HTML::Parser object
#   2. the decoded text of the text event
#
# Appends the text to the file-level row buffer $tmp, then swaps in a
# handler that ignores text so only this one text event is captured.
sub grab_href_text {
    my ( $parser, $link_text ) = @_;

    $parser->handler( text => sub { return if shift eq "" }, "dtext" );
    $tmp .= $link_text;
}
# End-tag handler (argspec "tagname, self"); only the tag name is used.
#
# On </tr> the text accumulated in $tmp is pushed onto @stats as one
# newline-terminated line, then the per-row state ($tmp and the cell
# counter $i) is reset.  Every other end tag is ignored.
sub end_table {
    my ($tagname) = @_;
    return unless $tagname eq "tr";

    # A row may close without any text having been captured (for example a
    # row with no <td> cells).  $tmp is undef then; substitute an empty
    # string so the same "\n" line is recorded without triggering an
    # "uninitialized value" warning under -w.
    my $row = defined $tmp ? $tmp : "";
    push @stats, "$row\n";

    undef $tmp;
    $i = 0;
}
# Start-tag handler (argspec "tagname, self").
#
# Reacts only to <tr>, <td> and <a>:
#   <td> - bumps the cell counter $i and routes text to get_table_text()
#   <a>  - routes text to grab_href_text()
# For any of the three tags, end_table() is (re)registered as the end-tag
# handler.
sub start {
    my ( $tagname, $parser ) = @_;

    return if $tagname !~ /^(tr|td|a)$/;

    if ( $tagname =~ /td/ ) {
        $i++;
        $parser->handler( text => \&get_table_text, "self, dtext" );
    }

    if ( $tagname =~ /a/ ) {
        $parser->handler( text => \&grab_href_text, "self, dtext" );
    }

    $parser->handler( end => \&end_table, "tagname, self" );
}
# Create a version-3 (event/callback) parser; start() installs the text
# and end handlers as the relevant tags are encountered.
$p = HTML::Parser->new( api_version => 3 );
$p->handler( start => \&start, "tagname, self" );

# Read the body through the public accessor rather than reaching into the
# response object's internal '_content' slot.
$p->parse( $resp->content );

# Signal end of document so any text the parser is still buffering is
# flushed to the handlers.
$p->eof;

# Each element of @stats already ends in a newline.
print @stats;
Hope that's educational/useful ... despite the conspicuous lack of comments!
(c8=