Agent00013's URL Checking Spider

by agent00013 (Pilgrim)
on Jul 11, 2001 at 03:00 UTC ( [id://95524] )
Category: Web Stuff
Author/Contact Info agent00013AThotmailDOTcom
Description: This script will spider a domain checking all URLs and outputting status and overall linkage statistics to a file. A number of settings can be modified such as the ability to strip anchors and queries so that dynamic links are ignored.
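
There is no command-line interface; you set the variables at the top of the script for the site you want to crawl and then just run it with perl. A rough sketch of the kind of setup involved (the site name and values below are illustrative placeholders, not the script's defaults):

my $restriction  = "www.example.com";          #only follow links containing this string
my $startpage    = "http://www.example.com/";  #full address; end site/directory names with /
my $stripquery   = "true";                     #"true" drops ?query strings from links
my $stripanchor  = "true";                     #"true" drops #anchors from links
my $linkInfoFile = "spider-report.txt";        #report file, rewritten every $updateFreq pages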

#!/usr/local/bin/perl -w
############################################################
use strict; 
use HTML::LinkExtor;        #used to extract links
use HTTP::Request::Common;
use LWP::Simple;           #used to check links
use LWP::UserAgent;        #helps with authorization    

### Global Variables ###
my (%visitedLinks, %linksToVisit, %linkResults);
#declare hashes

### Restricting Variables ###

my $restriction = "www.perlmonks.org";
#restricts links to addresses including $restriction

my $stripquery = "true";
#true or false, yanks the ?node=whatever,blah=whatever

my $stripanchor = "true";
#true or false, yanks the #anchor13

### some server settings variables ###

my $webserver = "http://www.somesite.com/mydir/";
#home web directory

my $onserver = "/theserver/mydir/";
#this is where the web address is located on your server...

my $localserver = "false";
#if this is true, it will perform some checks
# on the server side
# using the $webserver and $onserver variables

my $dirIndexExt = "index.pl";
#the default page loaded when a directory is requested from the server
# i.e. "http://www.perlmonks.org/"
# loads "http://www.niwoths.org/index.pl"

### Start Page ###
my $startpage = "http://www.perlmonks.org/";
#the page to start checking from -- must be full address
# if it's the site name or a directory name, please be sure to include /

### Site Login Info (if any) ###
my %logins = (
        
         );
#for directions on setting up this variable,
#please refer to the documentation
# "sitename.com"=>"user:pass"

### Log File ###
my $linkInfoFile = "linkInfoFile.txt";
#which pages link to what

my $updateFreq = 100;
#frequency at which link results file is updated

### Control Variables
my $sleep = 1;
#seconds to pause after displaying output results
my $timeout = 30;
#timeout in seconds

### Run the Script! ###

&main;                #run main function

&outLinkResults;
#when execution finishes, update the results

sub main {
############################################################
my($link);
$| =1; #unbuffer output

$linksToVisit{&linkFormatter($startpage,"_Beginning_")} = "_Beginning_";
#starting from the $startpage,
# marked as being linked to from "_Beginning_"

while ((scalar keys %linksToVisit) > 0) {
#while some left to check
  foreach (sort keys %linksToVisit) {    #foreach link
    $link = $_;
    &checkLink($link);            #check the link

    if ($linkResults{$link} eq "OK") {    #if the page is OK
      if ($link =~ /${restriction}/) {        
        if ($link =~ /(\.s?html?|\.php|\.(a|j)sp|\/$)/) { 
#it's a web page
          &grabLinks($link);        #grab the links
        }#end if
      }#end if
    }#end if

    if ((scalar keys %visitedLinks) % $updateFreq == 0) { #update time
      &outLinkResults;         #output link information 
      sleep($sleep);        #sleep
    }#end if
  }
}
############################################################
}

sub grabLinks{
############################################################
my(@links,@values);    #declare local variables
my($parser,$base_url,$elt_type);
my($attr_name,$attr_value,$value);
my $ua = LWP::UserAgent->new;
$ua->timeout($timeout);        #set timeout for UserAgent
my $req;

$base_url = shift; #grab the page url passed in

$req = HTTP::Request::Common::GET($base_url);    #the request
LINE: foreach (keys %logins) {
  if ($base_url =~ /$_/) {
    $req->authorization_basic(split(":",$logins{$_}));
    last LINE;
  }#end if
}#end GOTO style loop

$parser = HTML::LinkExtor->new(undef, $base_url);
#initialize parser
$parser->parse($ua->request($req)->as_string)->eof;
#grab the page

@links = $parser->links;     #parse it

foreach (@links) {
 $elt_type = shift @$_;        #get tag type
 if ($elt_type =~ /\b(a|img)\b/) {
  while (@$_) {    
   ($attr_name, $attr_value) = splice (@$_, 0, 2);
   if ($attr_value =~ /(https?|ftp|file)/) {
    $attr_value = &linkFormatter($attr_value,$base_url);
     if(exists $visitedLinks{$attr_value}) {
      $value = $visitedLinks{$attr_value};
      @values = split("::",$value);
      unless (grep {$base_url =~ /$_/} @values) {
     $visitedLinks{$attr_value} = join("::",$value,$base_url);
      }
     }#end if
     else {
      if(exists $linksToVisit{$attr_value}) {
       $value = $linksToVisit{$attr_value};
       @values = split("::",$value);
       unless (grep {$base_url =~ /$_/} @values) {
        $linksToVisit{$attr_value} = join("::",$value,$base_url);
       }#end unless
      }
      else {    #not marked to visit
       $linksToVisit{$attr_value} = $base_url;
      }
     }

   }    #end if
  }    #end while
 }    #end if
}    #end foreach
############################################################
}

sub checkLink {
############################################################
my $link = shift;        #grab the link passed in
my $req;            #declare local variable
print "Checking ${link}:";

my $ua = LWP::UserAgent->new;    #create user agent
my $results;

$req = HTTP::Request::Common::HEAD($link);
#grab head by default

if($link =~ /(\/cgi-bin|\.cgi|\.pl)/) {    #if cgi, do a post
  $req = HTTP::Request::Common::POST($link);
}
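#(HEAD keeps the check cheap; CGI links get a full POST instead, presumably
# because some CGI scripts don't respond sensibly to a HEAD request)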

LINE: foreach (keys %logins) {
  if ($link =~ /$_/) {        #if it needs a login
    $req->authorization_basic(split(":",$logins{$_}));
#send in login info
    last LINE;
  }#end if
}#end foreach (and LINE)

$ua->timeout($timeout);

$results = $ua->request($req)->as_string;
#grab it (head or full page)

if ($results =~ /\b200\b/) {    #request successful
  print "OK\n";
  $linkResults{$link} = "OK";
}
elsif ($results =~ /\b401\b|\b403\b/) {        #unauthorized or forbidden
  print "UNAUTH\n";
  $linkResults{$link} = "UNAUTH";
}
elsif ($results =~ /\b500\b/) {    #internal server error
  print "SERVERROR\n";
  $linkResults{$link} = "SERVERROR";
}
else {
  print "BAD\n";         #404 (file not found) or other
  $linkResults{$link} = "BAD";    #add it to results
}

$visitedLinks{$link} = $linksToVisit{$link};
#add it to list
delete $linksToVisit{$link}; #do not need to visit again
############################################################
}

sub outLinkResults{
############################################################
my ($key,$link,$value,$status);        #local variables
my %stats;

$stats{"goodpage"}   = 0; #init. values (none unassigned)
$stats{"badpage"}    = 0;
$stats{"unauthpage"} = 0;
$stats{"serverror"}  = 0;
$stats{"goodlink"}   = 0;
$stats{"badlink"}    = 0;
$stats{"unauthlink"} = 0;

my (@links,@keys);

print "-------------------------------\n";    
print "Updating Link Information file...\n";    
#let them know...

open(FILE,">$linkInfoFile") or die "Can't open $linkInfoFile: $!";
#open link info file

print FILE "##########################################\n";
print FILE "# Link File: Tells ya what links where   #\n";
print FILE "##########################################\n";
print FILE "# Note: Totals are listed at the bottom. #\n";
print FILE "##########################################\n";

@keys = sort keys %visitedLinks; #grab the keys
foreach $key (@keys) {         #go through all the keys
  $value  = $visitedLinks{$key}; #grab the value
  $status =  $linkResults{$key};
  chomp($status);
  push(@links,split(/::/,$value));
  print FILE "Information for ${key} --\n";
  print FILE "Status -- ${status}\n";
  if ($status eq "OK") {
    $stats{"goodpage"}++;
    $stats{"goodlink"} += scalar(@links);
  } 
  elsif ($status eq "UNAUTH") {
    $stats{"unauthpage"}++;
    $stats{"unauthlink"} += scalar(@links);
  }
  elsif ($status eq "SERVERROR") {
    $stats{"serverror"}++;
    $stats{"goodlink"} += scalar(@links);
  }
  else {
    $stats{"badpage"}++;
    $stats{"badlink"} += scalar(@links);
  }
  print FILE "Linked to from " . scalar(@links) . " pages: \n";
  foreach $link (@links) {
    print FILE "$link\n";
  }
  print FILE "-----------------------\n";
  @links = ();    #reset the referrer list for the next page
}
print FILE "       TOTALS\n";        #outputing totals
print FILE "-----------------------\n";
print FILE "   Good pages: " . $stats{"goodpage"}     . "\n";    #good
+ pages
print FILE "    Bad pages: " . $stats{"badpage"}      . "\n";    #bad 
+pages
print FILE " Unauth pages: " . $stats{"unauthpage"}   . "\n";    #unau
+thorized
print FILE "Server errors: " . $stats{"serverror"}    . "\n";   #serv.
+ error
print FILE "  Total pages: " . ($stats{"goodpage"} + $stats{"badpage"}
+ + $stats{"serverror"} +
$stats{"unauthpage"}) . "\n";
print FILE "-----------------------\n";
print FILE "   Good links: " . $stats{"goodlink"} . "\n";
print FILE "    Bad links: " . $stats{"badlink"} . "\n";
print FILE " Unauth links: " . $stats{"unauthlink"} . "\n";
print FILE "  Total links: " . ($stats{"goodlink"}    +
$stats{"badlink"} + $stats{"unauthlink"}) . "\n"; 
print FILE "-----------------------\n";

print " ...completed.\n";    #let 'em know it's done
print "Pages checked: " . (scalar keys % visitedLinks) . "\n";
print  "Pages remaining: " . (scalar keys %linksToVisit) . "\n";
print "-------------------------------\n";

close(FILE);
############################################################
}


sub linkFormatter {
############################################################
my($address,$from,$path);         #set local vars
$address = shift;                 #grab address
$from = shift;                    #grab addr. linked from

if ($stripanchor eq "true" && ($address =~ /\#/)) {
  $address =~ s!\#.*$!!;          #discard anchor
}
if ($stripquery  eq "true" && ($address =~ /\?/)) {
  $address =~ s!\?.*$!!;          #discard query
}
  
 if ($address =~ /\.\.\//) {
  #if the link includes a relative directory (../something/)
  #fix it up to the absolute address
  $from =~ s!/[^/]*?/?$!!;    #removes subdirectory
  $address =~ s!.*/\.\./(.*)!${from}/${1}!; #does it
 }
 
if ($localserver eq "true" && $address =~ /$webserver/) {
 #if the link is to a directory, add $dirIndexExt
 #not done to off server links cause we can't touch 'em
 $path = $address;
 $path =~ s|$webserver/?(.*)$|${1}|;
 $path = $onserver . $path;
 if (-d $path) {                #if it's a directory
  $address =~ s!(.*?)/?$!${1}/$dirIndexExt!;
#add on extension   
 }
} 
  
return $address;        #return the address
############################################################
}
###T#H#E##E#N#D###
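
For reference, each entry in the report file follows the prints in outLinkResults, so the output looks roughly like this (the addresses and counts are illustrative, not real results):

Information for http://www.perlmonks.org/index.pl --
Status -- OK
Linked to from 2 pages: 
http://www.perlmonks.org/
http://www.perlmonks.org/about.html
-----------------------
       TOTALS
-----------------------
   Good pages: 3
    Bad pages: 1
 Unauth pages: 0
Server errors: 0
  Total pages: 4
-----------------------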
