##
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use HTML::TokeParser;
# Fetch one PerlMonks node and print the author link of every reply byline.
#
# In the page source a byline has this token shape (see the dump after
# __END__):
#
#   <br>  " by "  <a ...>  AUTHOR  </a>  " on Mon DD, YYYY at HH:MM"
#
# Token layouts used below (from HTML::TokeParser):
#   ["S", $tag, $attr, $attrseq, $text]
#   ["E", $tag, $text]
#   ["T", $text, $is_data]
my $url = "http://perlmonks.org/index.pl?node_id=110166";

# LWP::Simple::get returns undef on failure and does not report through $!,
# so the message names the URL instead of an unrelated errno string.
# Test with defined() so that a legitimately false body ("0", "") is not
# mistaken for a failed download.
my $rawHTML = get($url);
die "Could not fetch $url" unless defined $rawHTML;

my $tp = HTML::TokeParser->new(\$rawHTML)
    or die "Could not create an HTML::TokeParser for $url";

# Number of tokens that follow the <br> in a byline: T, S(a), T, E(a), T.
my $lookahead_count = 5;

TOKEN:
while (my $token = $tp->get_token) {
    # Only a <br> start tag can begin a byline.
    next TOKEN unless $token->[0] eq "S" and $token->[1] eq "br";

    # Read the candidate byline tokens, stopping early at end of input so
    # we never dereference an undefined token.
    my @ahead;
    for (1 .. $lookahead_count) {
        my $next_token = $tp->get_token;
        last unless defined $next_token;
        push @ahead, $next_token;
    }

    if (@ahead == $lookahead_count) {
        my ($by, $a_open, $author, $a_close, $when) = @ahead;

        if (    $by->[0]      eq "T" and $by->[1]      =~ /by/
            and $a_open->[0]  eq "S" and $a_open->[1]  eq "a"
            and $author->[0]  eq "T"
            and $a_close->[0] eq "E" and $a_close->[1] eq "a"
            and $when->[0]    eq "T"
            and $when->[1]    =~ /on \w{3} \d{2}, \d{4} at/ )
        {
            # Original text of <a ...>, the author name, and </a>.
            print $a_open->[4], $author->[1], $a_close->[2], " | ";
            next TOKEN;
        }
    }

    # Not a byline: hand the look-ahead tokens back to the parser.  Without
    # this, a <br> sitting inside the look-ahead window (e.g. "<br><br> by
    # <a>...") would be swallowed and the byline it starts would be missed.
    $tp->unget_token(@ahead) if @ahead;
}

undef $rawHTML;    # release the raw HTML
undef $tp;         # release the HTML::TokeParser object
__END__
######### WITH ADDED NEWLINES FOR READABILITY AT ><
Re: Re: Name Space
by
Hofmator
on Sep 05, 2001 at 02:27
########## BROKEN DOWN BY TOKEN
TYPE : S
####
1:br
####
2:HASH(0x1af8128)
####
3:ARRAY(0x1afeeec)
####
4:
####
####################################################
TYPE : T
####
1: by
####
2:
####
####################################################
TYPE : S
####
1:a
####
2:HASH(0x1ab4384)
####
3:ARRAY(0x1ab6324)
####
4:
####
####################################################
TYPE : T
####
1:Hofmator
####
2:
####
####################################################
TYPE : E
####
1:a
####
2:
####
####################################################
TYPE : T
####
1: on Sep 05, 2001 at 02:27
####
2:
####
####################################################
##
##
["S", $tag, $attr, $attrseq, $text]
["E", $tag, $text]
["T", $text, $is_data]
["C", $text]
["D", $text]
["PI", $token0, $text]