I still got same error message:
C:\Perl\bin>we3.pl
verifying http://www.domain/mysite.com
Cannot fetch http://www.domain/mysite.com (status 501 Protocol scheme
+'' is not
supported)
I have tried several different URL's that I have access to and still get
the same message when I run this script.
Any other suggestions???
I made changes as suggested:
#!/usr/bin/perl
use lib "/perl/bin";
use LWP::UserAgent;
use HTML::Parser;
use URI::URL;
## begin configure
@CHECK = # list of initial starting points
qw(http://www.domain/mysite.com);
sub PARSE { # verify existance, parse for further URLs
## $_[0] is the absolute URL
$_[0] =~ m!^<http://www>\.(domain)\.com/! and not
$_[0] =~ /refindex/;
}
sub PING { # verify existence, but don't parse
## $_[0] is the absolute URL
$_[0] =~ m!^(http):!;
}
## end configure
{
package ParseLink;
@ISA = qw(HTML::Parser);
sub start { # called by parse
my $this = shift;
my ($tag, $attr) = @_;
if ($tag eq "a") {
$this->{links}{$attr->{href}}++;
} elsif ($tag eq "img") {
$this->{links}{$attr->{src}}++;
}
}
sub get_links {
my $this = shift;
sort keys %{$this->{links}};
}
} # end of ParseLink
$ua = new LWP::UserAgent;
$ua->agent("hverify/1.0");
$ua->env_proxy;
$| = 1;
MAINLOOP:
while ($thisurl = shift @CHECK) {
$thisurl =~ s/%7e/~/ig; # ugh :-)
next if $did{$thisurl}++;
if (PARSE $thisurl) {
warn "fetching $thisurl\n";
$request = HTTP::Request('GET',$thisurl);
$response = $ua->request($request); # fetch!
unless ($response->is_success) {
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
next MAINLOOP unless $response->content_type =~ /text\/html/i;
$base = $response->base;
my $p = ParseLink->new;
$p->parse($response->content);
$p->parse(undef);
for $link ($p->get_links) {
$abs = url($link, $base)->abs;
warn "... $link => $abs\n";
push(@CHECK, $abs);
}
next MAINLOOP;
}
if (PING $thisurl) {
warn "verifying $thisurl\n";
for $method (qw(HEAD GET)) {
$request = new HTTP::Request($method,$thisurl);
$response = $ua->request($request); # fetch!
next MAINLOOP if $response->is_success; # ok
}
warn
"Cannot fetch $thisurl (status ",
$response->code, " ", $response->message,")\n";
next MAINLOOP;
}
warn "[skipping $thisurl]\n";
}
|