sourcecode
rob_au
<code>
package Local::SiteRobot;

use HTML::LinkExtor;
use LWP::Simple;
use URI;

use strict;
sub new {
    my $class = shift;

    #  Default options - a DEPTH of undef imposes no limit on recursion
    my %options = (
        DEPTH        => undef,
        FOLLOW_REGEX => '',
        URLS         => [],
        VERBOSE      => 0
    );
    my %args = (%options, @_);
    foreach (keys %args) {
        die "Local::SiteRobot->new : Unknown argument option - $_" unless exists $options{$_};
    }
    my $self = bless \%args, (ref($class) || $class);
    $self->_verbose("Local::SiteRobot->new : Created new Local::SiteRobot object");
    return $self;
}

sub crawl {
    my $self = shift;
    return unless @{$self->{URLS}};
    my @pages;
    foreach my $url (@{$self->{URLS}}) {

        #  Only start crawling from absolute http URLs
        my $uri = URI->new($url);
        next unless $uri->scheme;
        next unless $uri->scheme eq 'http';
        $self->_verbose("Local::SiteRobot->crawl : Crawling from URL ", $uri->canonical->as_string);
        my @found = $self->_crawl($uri->canonical->as_string);
        $self->_verbose("Local::SiteRobot->crawl : Crawling from URL ", $uri->canonical->as_string, " returned ", scalar(@found), " pages");
        push (@pages, @found);
    }
    return @pages;
}

sub _crawl {
    my ($self, $url, $depth) = @_;
    my @pages;
    my $uri = URI->new($url);
    $self->_verbose("Local::SiteRobot->_crawl : GET ", $uri->canonical->as_string);
    my $html = get($uri->canonical->as_string);
    return unless $html;

    #  Stop recursing once the requested DEPTH has been reached
    return $uri->canonical->as_string if ((defined $self->{DEPTH}) && ($self->{DEPTH} == ($depth || 0)));
    $self->{pages}->{$uri->canonical->as_string} = 1;
    push (@pages, $uri->canonical->as_string);

    #  Extract links from the page, resolving relative URLs against the page base
    my $linkextor = HTML::LinkExtor->new(undef, $uri->canonical->as_string);
    $linkextor->parse($html);
    foreach my $link ($linkextor->links) {
        my ($tag, %attr) = @{$link};
        next unless ($tag eq 'a');
        next unless (defined $attr{'href'});

        #  Canonicalise the link before testing it so that the FOLLOW_REGEX match
        #  and the visited-page check operate on the same string form of the URL
        my $href = URI->new($attr{'href'})->canonical->as_string;
        next unless ($href =~ /$self->{FOLLOW_REGEX}/);
        next if exists $self->{pages}->{$href};
        $self->{pages}->{$href} = 1;
        push (@pages, $self->_crawl($href, ($depth || 0) + 1));
    }
    return @pages;
}

sub _verbose {
    my $self = shift;
    return unless $self->{VERBOSE};
    print STDERR @_, "\n";
}

1;
__END__
</code>
Earlier this month, [George_Sherston] posted a [id://124824|node] in which he submitted code for a site indexer and search engine. I took this code and decided to build upon it for my own site, and in evaluating it and other options available, I found [cpan://HTML::Index]. This module offered the ability to create site indexes for both local and remote files (the latter through the use of [cpan://WWW::SimpleRobot] by the same author). The ability to index by URL was important to me, as a great deal of the content on my site is dynamic in nature. This was where my journey hit a stumbling block ... [cpan://WWW::SimpleRobot] didn't work!<p>
So, I set about writing my own simplified robot code, which has one and only one function: to return a list of crawled URLs from a starting URL.<p>
<dl><dd><code>
#!/usr/bin/perl -w
use Local::SiteRobot;
use strict;
my $robot = Local::SiteRobot->new(
    DEPTH        => 10,
    FOLLOW_REGEX => '^http://www.cowsnet.com',
    URLS         => [ 'http://www.cowsnet.com.au' ]
);
my @pages = $robot->crawl;
print STDOUT $_, "\n" foreach @pages;
</code></dd></dl>
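The <code>DEPTH</code> and <code>VERBOSE</code> options can also be useful while testing - <code>VERBOSE</code> echoes the robot's progress to STDERR and <code>DEPTH</code> limits how far the recursion descends from the start URL. For example, reusing the start URL from above:<p>
<dl><dd><code>
#!/usr/bin/perl -w

use Local::SiteRobot;

use strict;

#  Crawl no more than two levels deep from the start URL and report
#  the robot's progress on STDERR as it goes
my $robot = Local::SiteRobot->new(
    DEPTH        => 2,
    FOLLOW_REGEX => '^http://www.cowsnet.com',
    URLS         => [ 'http://www.cowsnet.com.au' ],
    VERBOSE      => 1
);
my @pages = $robot->crawl;
print STDOUT scalar(@pages), " pages found\n";
</code></dd></dl>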
<p>The code, I feel, is quite self-explanatory. <code>/msg</code> me if you have any questions on usage.
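<p>As a rough sketch of where this fits into the bigger picture, the list of URLs returned by <code>crawl</code> can be fed to whatever indexing code you like. The following builds a crude inverted index with nothing more than [cpan://LWP::Simple] and a hash of hashes - this is only an illustration, not the [cpan://HTML::Index] interface:<p>
<dl><dd><code>
#!/usr/bin/perl -w

use Local::SiteRobot;
use LWP::Simple;

use strict;

my $robot = Local::SiteRobot->new(
    FOLLOW_REGEX => '^http://www.cowsnet.com',
    URLS         => [ 'http://www.cowsnet.com.au' ]
);

#  Build a crude inverted index - word => { url => 1 }
my %index;
foreach my $url ($robot->crawl) {
    my $html = get($url) or next;
    $html =~ s/<[^>]*>/ /g;
    foreach my $word (split /\W+/, lc $html) {
        next unless length $word > 2;
        $index{$word}{$url} = 1;
    }
}

#  Print every crawled URL on which the search term appears
my $term = lc(shift || 'perl');
print STDOUT $_, "\n" foreach keys %{ $index{$term} || {} };
</code></dd></dl>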
Web Stuff
[rob_au]