Beefy Boxes and Bandwidth Generously Provided by pair Networks
Do you know where your variables are?
 
PerlMonks  

ChilliBot Web Crawler

by ChilliHead (Monk)
on Jan 02, 2001 at 14:56 UTC ( #49276=perlcraft: print w/replies, xml ) Need Help??

   1: #!/usr/local/bin/perl
   2: 
   3: # This program uses the LWP module to read a web page
   4: # and store its URL, Title, Description and Keywords in
   5: # a database. It then follows the links on the page and
   6: # processes any other pages it finds. The following 
   7: # switches are used to control how the program operates:
   8: # 	/u specify the url to start at
   9: #	/c continue from the last saved restart point
  10: #	/f follow links on page and process pages found
  11: #	/r write next page to process to a restart file
  12: #	/d update the database
  13: #	/l produce logfile of crawl
  14: #   either /u or /c must be specified.
  15: #
  16: # This is my first attempt at a reasonable program in Perl
  17: # so please feel free to criticise to your hearts content
  18: # (its the only way I will learn).
  19: 
  20: use HTML::TreeBuilder;
  21: use LWP::RobotUA;
  22: use URI::URL;
  23: use DBI;
  24: 
  25: # Setup user agent for robot
  26: 
  27: $ua = LWP::RobotUA->new('ChilliBot/0.1.' . LWP::Version,'ChilliBot@nextmilltech.co.uk');
  28: 
  29: $ua->delay(1);
  30: 
  31: print $ua->agent;
  32: print "\n";
  33: 
  34: # check if any arguements entered
  35: 
  36: if(scalar(@ARGV) > 0)
  37: {
  38: 	my $startpage="";
  39: 
  40: 	local @visitedpages;
  41: 	local $logtofile = 0;
  42: 	local $drilldown = 0;
  43: 	local $usedatabase = 0;
  44: 	local $setforrestart =0;
  45: 	local $continue=0;
  46: 	local $logfilename;
  47: 
  48: 	$|=1; # set autoflush on for logfile
  49: 
  50: 	# check parameters and set appropriate variables.
  51: 
  52: 	for(my $i=0;$i<scalar(@ARGV);$i++)
  53: 	{
  54: 		if($ARGV[$i] eq "/f")
  55: 		{
  56: 			$drilldown = 1;
  57: 		}
  58: 		if($ARGV[$i] eq "/l")
  59: 		{
  60: 			$logtofile = 1;
  61: 			$logfilename=$ARGV[$i+1];
  62: 		}
  63: 		if($ARGV[$i] eq "/u")
  64: 		{
  65: 			$startpage=$ARGV[$i+1];
  66: 		}
  67: 		if($ARGV[$i] eq "/d")
  68: 		{
  69: 			$usedatabase = 1;
  70: 		}
  71: 		if($ARGV[$i] eq "/r")
  72: 		{
  73: 			$setforrestart = 1;
  74: 		}
  75: 		if($ARGV[$i] eq "/c")
  76: 		{
  77: 			$continue= 1;
  78: 		}
  79: 
  80: 	}
  81: 
  82: 	if($logtofile==1)
  83: 	{
  84: 		open(LOGFILE,">$logfilename") || die "Cannot open logfile $logfilename\n";
  85: 		close(LOGFILE);
  86: 	}
  87: 
  88: 
  89: 	# we do not want to visit pages already visited so keep an array
  90: 	# of their URLs
  91: 
  92: 	@visitedpages=();
  93: 
  94: 	if($usedatabase==1)
  95: 	{
  96: 		# if we are using the database then add all URLs from it
  97: 		# to the list of visited pages.
  98: 
  99: 		print "Building visited pages list...";
 100: 
 101: 		my $DSN = "DSN=PageData";
 102: 
 103: 		my $dbh  = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";
 104: 
 105: 		my $sql_handle=$dbh->prepare("SELECT PageURL FROM Page") || die $dbh->errstr;
 106: 
 107: 		$sql_handle->execute() || die $dbh->errstr;
 108: 
 109: 		while ( @row = $sql_handle->fetchrow_array() )
 110: 		{
 111: 			push(@visitedpages,$row[0]);	
 112: 		}
 113: 
 114: 		$dbh->disconnect();
 115: 		
 116: 		print "done\n";
 117: 	}
 118: 
 119: 	if($continue==1)
 120: 	{
 121: 		# if we are continuing then find which page to continue from
 122: 
 123: 		open(CONTINUE,"restartwith.txt") || die "Cannot open restart file\n";
 124: 		my @continueeurl=<CONTINUE>;
 125: 		foreach (@continueeurl)
 126: 		{
 127: 			$startpage=$_;
 128: 		}
 129: 		close(CONTINUE);
 130: 	}
 131: 
 132: 	if($startpage ne "")
 133: 	{
 134: 		&gethtml($startpage);
 135: 	}
 136: 
 137: 
 138: 
 139: 
 140: }
 141: else
 142: {
 143: 	# No parameters entered so printout the usage information
 144: 
 145: 	print "Usage:\n";
 146: 	print "    perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/s] [/l logfile]\n";
 147: 	print "        where /u - url to start crawl from\n";
 148: 	print "              /f - follow links on each page\n";
 149: 	print "              /d - add page details to database\n";
 150: 	print "              /r - save last accessed url for restart with /c\n";
 151: 	print "              /c - continue from last restart-saved url\n";
 152: 	print "              /l - output to logfile\n\n";
 153: 	print "	       note: either /u or /c must be specified\n\n";
 154: }
 155: print ("Run Complete\n");
 156: 
 157: 
 158: # main routine
 159: 
 160: sub gethtml
 161: {
 162: 
 163: 	my $html;
 164: 	my $treeparse;
 165: 
 166: 	my $rlink;
 167: 	my @linkarray;
 168: 	my $baseurl;
 169: 	my $pagealreadyvisited;
 170: 	my $pagetoprocess;
 171: 	my $rlinkarray;
 172: 
 173: 	local $pagetitle ="";
 174: 	local $pagedescription = "";
 175: 	local $pagekeywords="";
 176: 	local $pagebaseurl="";
 177: 
 178: 	$pagetoprocess = $_[0];
 179: 
 180: 	if($setforrestart==1)
 181: 	{
 182: 		# write URL to restart file.
 183: 
 184: 		open(RESTARTFILE,">restartwith.txt") || die "Cannot open restart file\n";
 185: 		print RESTARTFILE $pagetoprocess;
 186: 		close(RESTARTFILE);
 187: 	}
 188: 
 189: 	# check we have not already visited this page
 190: 
 191: 	$pagealreadyvisited=0;
 192: 
 193: 	foreach (@visitedpages)
 194: 	{
 195: 		if($_ eq $pagetoprocess)
 196: 		{
 197: 			$pagealreadyvisited=1;
 198: 		}
 199: 
 200: 	}
 201: 
 202: 	if ($pagealreadyvisited == 0)
 203: 	{
 204: 		print "Processing: $_[0]...";
 205: 
 206: 		# request the page
 207: 
 208: 		$request = HTTP::Request->new('GET', $_[0]);		
 209: 		$response = $ua->request($request); 	
 210: 
 211: 		if ($response->is_success) 
 212: 		{		    
 213: 			if($logtofile==1)
 214:      			{
 215: 				open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
 216:      				print LOGFILE "Processing: $_[0]...Response OK\n";
 217: 				close(LOGFILE);
 218: 		  	}
 219: 			
 220: 			# parse retrieved HTML
 221: 			
 222: 			@linkarray=();
 223: 			$html=$response->content;
 224: 			$treeparse=HTML::TreeBuilder->new;
 225: 			$treeparse->parse($html);
 226: 			
 227: 			# extract anchor links
 228: 
 229: 			$rlinkarray=$treeparse->extract_links("a");
 230: 
 231: 			# call treewalker function to check meta tags
 232: 
 233: 			$treeparse->traverse(\&treewalker);
 234: 			$treeparse->delete();
 235: 			$pagebaseurl=$response->base;
 236: 
 237: 			if($logtofile==1)
 238: 		     			{
 239: 						open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
 240: 		     				print LOGFILE "    Title: $pagetitle\n";
 241: 		     				print LOGFILE "    Description: $pagedescription\n";
 242: 		     				print LOGFILE "    Keywords: $pagekeywords\n";
 243: 						print LOGFILE "    Base URL: $pagebaseurl\n";
 244: 						close(LOGFILE);
 245: 		     			}
 246: 	
 247: 			if($usedatabase==1)
 248: 			{
 249: 
 250: 				# write page details to database
 251: 
 252: 				# DBI::ODBC falls over with any string
 253: 				# longer than 255
 254: 
 255: 				if(length($pagetitle)>255)
 256: 				{
 257: 					$pagetitle=substr($pagetitle,0,255);
 258: 				}
 259: 
 260: 				if(length($pagedescription)>255)
 261: 				{
 262: 					$pagedescription=substr($pagedescription,0,255);
 263: 				}
 264: 				
 265: 				if(length($pagekeywords)>255)
 266: 				{
 267: 					$pagekeywords=substr($pagekeywords,0,255);
 268: 				}
 269: 								
 270: 				my $DSN = "DSN=PageData";
 271: 
 272: 				my $dbh  = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";
 273: 
 274: 				my $sql_handle=$dbh->prepare(q{
 275: 						INSERT INTO Page (PageURL, Title, Description,Keywords) VALUES (?, ?, ?, ?)
 276: 						}) || die $dbh->errstr;
 277: 
 278: 				$sql_handle->execute("$_[0]","$pagetitle","$pagedescription","$pagekeywords")
 279: 						    || die $dbh->errstr;
 280: 
 281: 				$dbh->disconnect();
 282: 
 283: 			}
 284: 
 285: 			# add page to visited pages array
 286: 
 287: 			push(@visitedpages,$_[0]);	
 288: 
 289: 			print "OK\n";
 290: 
 291: 			# convert links from a referenced array to
 292: 			# a normal array
 293: 			
 294: 			foreach $rlink(@$rlinkarray)
 295: 			{
 296: 	
 297: 				push(@linkarray,$$rlink[0]);	
 298: 			}
 299: 
 300: 			# create full URLs from links
 301: 
 302: 			$baseurl=$response->base;
 303: 			@linkarray = map { $_= url($_, $baseurl)->abs; } @linkarray;
 304: 
 305: 			foreach (@linkarray)
 306: 			{
 307: 				if($logtofile==1)
 308: 		     		{
 309: 					open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
 310: 		     			print LOGFILE "    $_\n";
 311: 					close(LOGFILE);
 312: 		     		}
 313: 
 314: 			}
 315: 		
 316: 			# put in seperate loop so that printout is correct
 317: 			foreach (@linkarray)
 318: 			{
 319: 				# if link is http and does not contain
 320: 				# any odd charcters then call this function
 321: 				# recursively passing in the link
 322: 
 323: 				if (/http:/i)
 324: 				{
 325: 					if (/[#\@\$]/)
 326: 					{
 327: 					}
 328: 					else
 329: 					{
 330: 						if($drilldown == 1)
 331: 						{
 332: 							&gethtml($_);
 333: 						}
 334: 					}
 335: 				}
 336: 			}
 337: 		} 
 338: 		else 
 339: 		{
 340: 		     print "Failed\n";
 341: 		     if($logtofile==1)
 342:      		     {
 343: 			open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
 344:      			print LOGFILE "Processing: $_[0]...Failed\n";
 345: 			close(LOGFILE);
 346: 		     }
 347: 		}
 348: 	}
 349: 
 350: }
 351: 
 352: # Used to find title tag, and description and keyword metatags.
 353: sub treewalker
 354: {
 355: 	my ($node, $start, $depth) = @_;
 356:         if (ref $node) 
 357: 	{
 358:              my $tag = $node->tag;
 359: 
 360: 	     if ($tag eq "meta")
 361: 	     {
 362: 		my $metaname=$node->attr("name");
 363: 		if ($metaname ne "")
 364: 		{
 365: 			if ($metaname=~ /description/i)
 366: 			{
 367: 				my $description=$node->attr("content");
 368: 
 369: 				# remove CR and LF from description.
 370: 				$description =~ s/\n/ /sg;
 371: 				$description =~ s/\r/ /sg;
 372: 				$pagedescription = $description;		
 373: 			}
 374: 			if ($metaname=~ /keywords/i)
 375: 			{
 376: 				my $keywords=$node->attr("content");
 377: 
 378: 				# remove CR and LF from description.
 379: 				$keywords =~ s/\n/ /sg;
 380: 				$keywords =~ s/\r/ /sg;
 381: 				$pagekeywords = $keywords;		
 382: 			}
 383: 		}
 384:      	     }     
 385: 	
 386: 	     if ($tag eq "title" && $pagetitle eq "")
 387: 	     {
 388: 		my $contentnodearray=$node->content;
 389: 		foreach my $contentnode(@$contentnodearray)
 390: 		{
 391: 			if (not ref $contentnode)
 392: 			{
 393: 				$pagetitle=$contentnode;		
 394: 			}
 395: 		}
 396: 	     }
 397: 	
 398: 	
 399: 
 400: 	}
 401: 	return 1; # This makes it recurse through all sub nodes
 402: }
 403:  

Replies are listed 'Best First'.
Re: ChilliBot Web Crawler
by ichimunki (Priest) on Jan 02, 2001 at 20:58 UTC
    This is a great start. Some off-the-cuff suggestions:

    use GetOpts;

    Think about "my" vs. "local".

    You have at least one if statement where the TRUE result is an empty block, you might want to look at unless as an alternative.

    Of course, it's easy for me to say these things, I didn't have to do all the hard work of writing it. I just get to comment on it. {grin}
Re: ChilliBot Web Crawler
by dws (Chancellor) on Jan 02, 2001 at 23:36 UTC
    If you're intent on rolling your own, this is a reasonably good start. A couple of suggestions:

    1. Add a Configuration section. Stuff like DB connect strings belong there.

        my $odbcDSN = "PageData"
        my $dbConnectString = "dbi:ODBC:DSN=$odbcDSN";
    
    This way, if someone wants to retarget your code to some other DBMS, they may be able to get by with only changes to the Configuration section.

    2. By embedding the "usage" message so far down in the code, you increase the risk that when you add a new option, you forget to change the message. Better to either have a usage() routine at the top (where it also serves as documentation, or as part of the POD documentation. I've seen a couple of scripts that open themselves and extract the usage message from their own POD.

    3. gethtml() is rather large. Look at the casually-related things that're done there, and consider ways to break them out into separate subroutines.

    4. Note that many sites take defensive measures against being sucked dry by robots. At the very least, it is considered polite to stagger successive requests to the same site by 60 seconds.

Re: ChilliBot Web Crawler
by Anonymous Monk on Feb 06, 2001 at 22:14 UTC
    My only suggestion would be to include notes as to how to create the database and tables it would need with the /d switch. The only other thing I can think of was that it did not like me using a file:// url, but alas, I can work around that. Thank you for posting the code. -Albert C.
Re: ChilliBot Web Crawler
by Anonymous Monk on Sep 15, 2016 at 01:13 UTC
    I'm not sure how you enable the switch. ie /c So far it's not working...

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: perlcraft [id://49276]
Approved by root
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others pondering the Monastery: (2)
As of 2021-11-27 09:45 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found

    Notices?