One step at a time.
This script collects the sid numbers from every available page, up to a safety limit on the number of pages it will fetch.
#! /usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use HTML::TreeBuilder;
use LWP::Simple;
use URI;
# Base address of the sortie listing; get_sids() adds the page and pid
# query parameters itself. The address must be a single unbroken string:
# a line break (or a "+" wrap marker) inside q{} becomes part of the URL.
my $url = q{http://csr.wwiionline.com/scripts/services/persona/sorties.jsp};

# persona id whose sorties are listed
my $pid = 173384;

# collect the sids from the listing pages and dump them for inspection
my @sids = get_sids($url, $pid);
die qq{no sids found\n} unless @sids;
print Dumper \@sids;
# get_sids($url, $pid [, $max_pages])
#
# Fetches the listing at $url page by page (query parameters "page" and
# "pid") and collects the value of the "sid" query parameter from every
# anchor href that carries one.
#
# Parameters:
#   $url       - base address of the listing, without query string
#   $pid       - value sent as the "pid" query parameter
#   $max_pages - optional cap on the number of pages fetched (default 7);
#                protects the server from an accidental endless loop
#
# Returns the list of sids in the order they were found (duplicates kept).
#
# Dies if a page cannot be fetched or parsed, if a page contains no anchors,
# or if more pages remain after $max_pages pages have been fetched.
sub get_sids {
    my ($url, $pid, $max_pages) = @_;
    $max_pages = 7 unless defined $max_pages;

    my $uri     = URI->new($url);
    my $page    = 1;
    my $fetched = 0;
    my @sids;

    while ($page) {
        # build the uri for the current page
        $uri->query_form(page => $page, pid => $pid);
        my $uri_string = $uri->as_string;

        # LWP::Simple::get signals failure by returning undef; it does not
        # set $!, so report the address instead
        my $content = get($uri_string);
        die qq{LWP get failed for $uri_string\n}
            unless defined $content and length $content;
        $fetched++;

        # build the tree
        my $tree = HTML::TreeBuilder->new_from_content($content)
            or die qq{new from content failed for $uri_string\n};

        # get a list of all anchor tags
        my @anchors = $tree->look_down(_tag => q{a});
        if (!@anchors) {
            $tree->delete;
            die qq{no anchors found in $uri_string\n};
        }

        # look at each anchor and keep the sid from its query string, if any
        for my $anchor (@anchors) {
            my $href = $anchor->attr(q{href});
            next unless defined $href;
            my %query = URI->new($href)->query_form;
            push @sids, $query{sid} if exists $query{sid};
        }

        # see if there is another page, then release the tree explicitly
        # so each parsed page is freed before the next one is fetched
        $page = get_next_page($tree);
        $tree->delete;

        # only complain about the cap when there really is more to fetch,
        # so a listing of exactly $max_pages pages still returns its sids
        die qq{more than $max_pages pages for pid $pid, giving up\n}
            if $page and $fetched >= $max_pages;
    }

    # send 'em back
    return @sids;
}
# get_next_page($t)
#
# Reads the text of the second cell in the first row of the ninth table of
# the parsed page and extracts the page number from a "PAGE <n> >" marker.
#
# Parameters:
#   $t - parsed page; any object providing look_down() whose results
#        provide look_down() / as_text() (e.g. an HTML::TreeBuilder tree)
#
# Returns the captured page number (one or more digits), or undef when the
# table, row or cell is missing or the text holds no "PAGE <n> >" marker.
sub get_next_page {
    my ($t) = @_;

    # we want table 9 (index 8)
    my @tables = $t->look_down(_tag => q{table});
    my $table = $tables[8];
    return unless $table;

    # first row
    my @trs = $table->look_down(_tag => q{tr});
    my $tr = $trs[0];
    return unless $tr;

    # second column
    my @tds = $tr->look_down(_tag => q{td});
    my $td = $tds[1];
    return unless $td;

    # get any text
    my $page_number_txt = $td->as_text;
    return unless defined $page_number_txt;

    # \d+ rather than \d: page numbers of 10 and above have several digits;
    # $page stays undef when the marker is absent
    my ($page) = $page_number_txt =~ /PAGE (\d+) >/;
    return $page;
}
Some points to note:
It uses HTML::TreeBuilder to parse the HTML. I find it easier than using regexes. There are many parsers available and monks have their preferences; I've settled on this one and have got used to it.
It also uses URI to construct/parse URIs. Could be overkill in this case but if someone else has done all the work I'm happy to take advantage. :-)
And all those 'q's? They're alternatives to single and double quote marks (there are some others too). You don't have to use them, again it's a preference. I started using them for the very scientific reason that my code highlighter is particularly bad at handling single and double quotes. :-)
If you download it, first see if it compiles. Then see if it runs. If the output is not as expected make a note of what Perl says about the matter and post it here. If all goes fine let us know the next step.
Fingers crossed.