http://qs321.pair.com?node_id=49276

#!/usr/local/bin/perl

# This program uses the LWP module to read a web page
# and store its URL, Title, Description and Keywords in
# a database. It then follows the links on the page and
# processes any other pages it finds. The following
# switches are used to control how the program operates:
#     /u specify the URL to start at
#     /c continue from the last saved restart point
#     /f follow links on page and process pages found
#     /r write next page to process to a restart file
#     /d update the database
#     /l produce logfile of crawl
#     either /u or /c must be specified.
#
# This is my first attempt at a reasonable program in Perl,
# so please feel free to criticise to your heart's content
# (it's the only way I will learn).

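# For illustration only, a couple of hypothetical invocations (the URL and
# logfile name below are made-up examples, not from the original write-up):
#
#     perl robot.pl /u http://www.example.com/ /f /d /r /l crawl.log
#     perl robot.pl /c /f /d /l crawl.log
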
use HTML::TreeBuilder;
use LWP::RobotUA;
use HTTP::Request;    # used below to build each GET request
use URI::URL;
use DBI;

# Set up the user agent for the robot. LWP::RobotUA fetches and honours each
# site's robots.txt and enforces the request delay automatically.

$ua = LWP::RobotUA->new('ChilliBot/0.1.' . LWP::Version, 'ChilliBot@nextmilltech.co.uk');

$ua->delay(1);    # delay between requests to the same server, in minutes

print $ua->agent;
print "\n";

# check if any arguments were entered

if(scalar(@ARGV) > 0)
{
    my $startpage = "";

    # dynamically scoped so they are visible inside gethtml()
    local @visitedpages;
    local $logtofile = 0;
    local $drilldown = 0;
    local $usedatabase = 0;
    local $setforrestart = 0;
    local $continue = 0;
    local $logfilename;

    $| = 1;    # turn on autoflush for STDOUT so progress messages appear immediately

    # check parameters and set appropriate variables.

    for(my $i = 0; $i < scalar(@ARGV); $i++)
    {
        if($ARGV[$i] eq "/f")
        {
            $drilldown = 1;
        }
        if($ARGV[$i] eq "/l")
        {
            $logtofile = 1;
            $logfilename = $ARGV[$i+1];
        }
        if($ARGV[$i] eq "/u")
        {
            $startpage = $ARGV[$i+1];
        }
        if($ARGV[$i] eq "/d")
        {
            $usedatabase = 1;
        }
        if($ARGV[$i] eq "/r")
        {
            $setforrestart = 1;
        }
        if($ARGV[$i] eq "/c")
        {
            $continue = 1;
        }
    }

    if($logtofile==1)
    {
        open(LOGFILE,">$logfilename") || die "Cannot open logfile $logfilename\n";
        close(LOGFILE);
    }

    # we do not want to visit pages already visited so keep an array
    # of their URLs

    @visitedpages = ();

    if($usedatabase==1)
    {
        # if we are using the database then add all URLs from it
        # to the list of visited pages.

        print "Building visited pages list...";

        my $DSN = "DSN=PageData";    # pre-configured ODBC data source

        my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";

        my $sql_handle = $dbh->prepare("SELECT PageURL FROM Page") || die $dbh->errstr;

        $sql_handle->execute() || die $dbh->errstr;

        while ( my @row = $sql_handle->fetchrow_array() )
        {
            push(@visitedpages, $row[0]);
        }

        $dbh->disconnect();

        print "done\n";
    }
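
    # For reference, the Page table used above and in gethtml() is assumed
    # (its definition is not part of this post) to look something like:
    #
    #     CREATE TABLE Page (
    #         PageURL     VARCHAR(255),
    #         Title       VARCHAR(255),
    #         Description VARCHAR(255),
    #         Keywords    VARCHAR(255)
    #     )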

    if($continue==1)
    {
        # if we are continuing then find which page to continue from

        open(CONTINUE,"restartwith.txt") || die "Cannot open restart file\n";
        my @continueurl = <CONTINUE>;
        foreach (@continueurl)
        {
            $startpage = $_;
        }
        close(CONTINUE);
        chomp($startpage);    # strip any trailing newline from the saved URL
    }

    if($startpage ne "")
    {
        &gethtml($startpage);
    }
}
else
{
    # No parameters entered so print out the usage information

    print "Usage:\n";
    print "    perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/l logfile]\n";
    print "        where /u - url to start crawl from\n";
    print "              /f - follow links on each page\n";
    print "              /d - add page details to database\n";
    print "              /r - save last accessed url for restart with /c\n";
    print "              /c - continue from last restart-saved url\n";
    print "              /l - output to logfile\n\n";
    print "        note: either /u or /c must be specified\n\n";
}
print ("Run Complete\n");

# main routine

sub gethtml
{
    my $html;
    my $treeparse;

    my $rlink;
    my @linkarray;
    my $baseurl;
    my $pagealreadyvisited;
    my $pagetoprocess;
    my $rlinkarray;

    # dynamically scoped so treewalker() can see and set them
    local $pagetitle = "";
    local $pagedescription = "";
    local $pagekeywords = "";
    local $pagebaseurl = "";

    $pagetoprocess = $_[0];

    if($setforrestart==1)
    {
        # write URL to restart file.

        open(RESTARTFILE,">restartwith.txt") || die "Cannot open restart file\n";
        print RESTARTFILE $pagetoprocess;
        close(RESTARTFILE);
    }

    # check we have not already visited this page

    $pagealreadyvisited = 0;

    foreach (@visitedpages)
    {
        if($_ eq $pagetoprocess)
        {
            $pagealreadyvisited = 1;
        }
    }

    if ($pagealreadyvisited == 0)
    {
        print "Processing: $_[0]...";

        # request the page

        $request = HTTP::Request->new('GET', $_[0]);
        $response = $ua->request($request);

        if ($response->is_success)
        {
            if($logtofile==1)
            {
                open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
                print LOGFILE "Processing: $_[0]...Response OK\n";
                close(LOGFILE);
            }

            # parse retrieved HTML

            @linkarray = ();
            $html = $response->content;
            $treeparse = HTML::TreeBuilder->new;
            $treeparse->parse($html);

            # extract anchor links

            $rlinkarray = $treeparse->extract_links("a");

            # call treewalker function to check meta tags

            $treeparse->traverse(\&treewalker);
            $treeparse->delete();
            $pagebaseurl = $response->base;

            if($logtofile==1)
            {
                open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
                print LOGFILE "    Title: $pagetitle\n";
                print LOGFILE "    Description: $pagedescription\n";
                print LOGFILE "    Keywords: $pagekeywords\n";
                print LOGFILE "    Base URL: $pagebaseurl\n";
                close(LOGFILE);
            }

            if($usedatabase==1)
            {
                # write page details to database

                # DBD::ODBC falls over with any string
                # longer than 255 characters

                if(length($pagetitle)>255)
                {
                    $pagetitle = substr($pagetitle,0,255);
                }

                if(length($pagedescription)>255)
                {
                    $pagedescription = substr($pagedescription,0,255);
                }

                if(length($pagekeywords)>255)
                {
                    $pagekeywords = substr($pagekeywords,0,255);
                }

                my $DSN = "DSN=PageData";

                my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";

                my $sql_handle = $dbh->prepare(q{
                        INSERT INTO Page (PageURL, Title, Description, Keywords) VALUES (?, ?, ?, ?)
                        }) || die $dbh->errstr;

                $sql_handle->execute($_[0], $pagetitle, $pagedescription, $pagekeywords)
                        || die $dbh->errstr;

                $dbh->disconnect();
            }

            # add page to visited pages array

            push(@visitedpages, $_[0]);

            print "OK\n";

            # convert links from a referenced array to
            # a normal array

            foreach $rlink (@$rlinkarray)
            {
                push(@linkarray, $$rlink[0]);
            }

            # create full URLs from links

            $baseurl = $response->base;
            @linkarray = map { url($_, $baseurl)->abs } @linkarray;

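            # As a rough illustration (the URLs are made up): extract_links("a")
            # returns a reference to a list of array refs, each starting with the
            # raw link text, e.g. [ ["news/today.html", ...], ["/about.html", ...] ],
            # and url(...)->abs resolves each one against the page base, so
            # "news/today.html" found on http://www.example.com/index.html becomes
            # http://www.example.com/news/today.html.
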
            foreach (@linkarray)
            {
                if($logtofile==1)
                {
                    open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
                    print LOGFILE "    $_\n";
                    close(LOGFILE);
                }
            }

            # done in a separate loop so that the printout is correct
            foreach (@linkarray)
            {
                # if the link is http and does not contain
                # any odd characters then call this function
                # recursively, passing in the link

                if (/http:/i)
                {
                    unless (/[#\@\$]/)
                    {
                        if($drilldown == 1)
                        {
                            &gethtml($_);
                        }
                    }
                }
            }
        }
        else
        {
            print "Failed\n";
            if($logtofile==1)
            {
                open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
                print LOGFILE "Processing: $_[0]...Failed\n";
                close(LOGFILE);
            }
        }
    }
}

# Used to find the title tag, and the description and keywords meta tags.
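# For example, treewalker() would pick these out of markup like (illustrative
# snippet, not from any real page):
#     <title>Some Example Page</title>
#     <meta name="description" content="A short summary of the page">
#     <meta name="keywords" content="perl, lwp, robot">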
sub treewalker
{
    my ($node, $start, $depth) = @_;

    if (ref $node)    # text segments are passed as plain strings, so skip them
    {
        my $tag = $node->tag;

        if ($tag eq "meta")
        {
            my $metaname = $node->attr("name");
            if (defined $metaname && $metaname ne "")
            {
                if ($metaname =~ /description/i)
                {
                    my $description = $node->attr("content");

                    # remove CR and LF from description.
                    $description =~ s/\n/ /sg;
                    $description =~ s/\r/ /sg;
                    $pagedescription = $description;
                }
                if ($metaname =~ /keywords/i)
                {
                    my $keywords = $node->attr("content");

                    # remove CR and LF from keywords.
                    $keywords =~ s/\n/ /sg;
                    $keywords =~ s/\r/ /sg;
                    $pagekeywords = $keywords;
                }
            }
        }

        if ($tag eq "title" && $pagetitle eq "")
        {
            my $contentnodearray = $node->content;
            foreach my $contentnode (@$contentnodearray)
            {
                if (not ref $contentnode)
                {
                    $pagetitle = $contentnode;
                }
            }
        }
    }
    return 1; # This makes it recurse through all sub nodes
}
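
As an aside, here is a minimal, self-contained sketch of the same title/meta extraction run against a hard-coded page (the markup is made up), which can be handy for testing the parsing logic without making any HTTP requests:

use HTML::TreeBuilder;

my $html = <<'HTML';
<html><head>
<title>Example Page</title>
<meta name="description" content="A short summary.">
<meta name="keywords" content="perl, lwp, robot">
</head><body><a href="next.html">next</a></body></html>
HTML

my ($title, $description, $keywords) = ("", "", "");

my $tree = HTML::TreeBuilder->new;
$tree->parse($html);

$tree->traverse(sub {
    my ($node, $start, $depth) = @_;
    return 1 unless ref $node;            # text segments arrive as plain strings
    my $tag = $node->tag;
    if ($tag eq "meta") {
        my $name = $node->attr("name");
        if (defined $name) {
            $description = $node->attr("content") if $name =~ /description/i;
            $keywords    = $node->attr("content") if $name =~ /keywords/i;
        }
    }
    if ($tag eq "title" && $title eq "") {
        foreach my $c (@{$node->content}) {
            $title = $c unless ref $c;
        }
    }
    return 1;                             # keep descending into sub nodes
});
$tree->delete();

print "Title: $title\nDescription: $description\nKeywords: $keywords\n";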