comment on

0: #!/usr/local/bin/perl
1: 
2: # This program uses the LWP module to read a web page
3: # and store its URL, Title, Description and Keywords in
4: # a database. It then follows the links on the page and
5: # processes any other pages it finds. The following 
6: # switches are used to control how the program operates:
7: # 	/u specify the url to start at
8: #	/c continue from the last saved restart point
9: #	/f follow links on page and process pages found
10: #	/r write next page to process to a restart file
11: #	/d update the database
12: #	/l produce logfile of crawl
13: #   either /u or /c must be specified.
14: #
15: # This is my first attempt at a reasonable program in Perl
16: # so please feel free to criticise to your hearts content
17: # (its the only way I will learn).
18: 
19: use HTML::TreeBuilder;
20: use LWP::RobotUA;
21: use URI::URL;
22: use DBI;
23: 
24: # Setup user agent for robot
25: 
26: $ua = LWP::RobotUA->new('ChilliBot/0.1.' . LWP::Version,'ChilliBot@nextmilltech.co.uk');
27: 
28: $ua->delay(1);
29: 
30: print $ua->agent;
31: print "\n";
32: 
33: # check if any arguements entered
34: 
35: if(scalar(@ARGV) > 0)
36: {
37: 	my $startpage="";
38: 
39: 	local @visitedpages;
40: 	local $logtofile = 0;
41: 	local $drilldown = 0;
42: 	local $usedatabase = 0;
43: 	local $setforrestart =0;
44: 	local $continue=0;
45: 	local $logfilename;
46: 
47: 	$|=1; # set autoflush on for logfile
48: 
49: 	# check parameters and set appropriate variables.
50: 
51: 	for(my $i=0;$i<scalar(@ARGV);$i++)
52: 	{
53: 		if($ARGV[$i] eq "/f")
54: 		{
55: 			$drilldown = 1;
56: 		}
57: 		if($ARGV[$i] eq "/l")
58: 		{
59: 			$logtofile = 1;
60: 			$logfilename=$ARGV[$i+1];
61: 		}
62: 		if($ARGV[$i] eq "/u")
63: 		{
64: 			$startpage=$ARGV[$i+1];
65: 		}
66: 		if($ARGV[$i] eq "/d")
67: 		{
68: 			$usedatabase = 1;
69: 		}
70: 		if($ARGV[$i] eq "/r")
71: 		{
72: 			$setforrestart = 1;
73: 		}
74: 		if($ARGV[$i] eq "/c")
75: 		{
76: 			$continue= 1;
77: 		}
78: 
79: 	}
80: 
81: 	if($logtofile==1)
82: 	{
83: 		open(LOGFILE,">$logfilename") || die "Cannot open logfile $logfilename\n";
84: 		close(LOGFILE);
85: 	}
86: 
87: 
88: 	# we do not want to visit pages already visited so keep an array
89: 	# of their URLs
90: 
91: 	@visitedpages=();
92: 
93: 	if($usedatabase==1)
94: 	{
95: 		# if we are using the database then add all URLs from it
96: 		# to the list of visited pages.
97: 
98: 		print "Building visited pages list...";
99: 
100: 		my $DSN = "DSN=PageData";
101: 
102: 		my $dbh  = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";
103: 
104: 		my $sql_handle=$dbh->prepare("SELECT PageURL FROM Page") || die $dbh->errstr;
105: 
106: 		$sql_handle->execute() || die $dbh->errstr;
107: 
108: 		while ( @row = $sql_handle->fetchrow_array() )
109: 		{
110: 			push(@visitedpages,$row[0]);	
111: 		}
112: 
113: 		$dbh->disconnect();
114: 		
115: 		print "done\n";
116: 	}
117: 
118: 	if($continue==1)
119: 	{
120: 		# if we are continuing then find which page to continue from
121: 
122: 		open(CONTINUE,"restartwith.txt") || die "Cannot open restart file\n";
123: 		my @continueeurl=<CONTINUE>;
124: 		foreach (@continueeurl)
125: 		{
126: 			$startpage=$_;
127: 		}
128: 		close(CONTINUE);
129: 	}
130: 
131: 	if($startpage ne "")
132: 	{
133: 		&gethtml($startpage);
134: 	}
135: 
136: 
137: 
138: 
139: }
140: else
141: {
142: 	# No parameters entered so printout the usage information
143: 
144: 	print "Usage:\n";
145: 	print "    perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/s] [/l logfile]\n";
146: 	print "        where /u - url to start crawl from\n";
147: 	print "              /f - follow links on each page\n";
148: 	print "              /d - add page details to database\n";
149: 	print "              /r - save last accessed url for restart with /c\n";
150: 	print "              /c - continue from last restart-saved url\n";
151: 	print "              /l - output to logfile\n\n";
152: 	print "	       note: either /u or /c must be specified\n\n";
153: }
154: print ("Run Complete\n");
155: 
156: 
157: # main routine
158: 
159: sub gethtml
160: {
161: 
162: 	my $html;
163: 	my $treeparse;
164: 
165: 	my $rlink;
166: 	my @linkarray;
167: 	my $baseurl;
168: 	my $pagealreadyvisited;
169: 	my $pagetoprocess;
170: 	my $rlinkarray;
171: 
172: 	local $pagetitle ="";
173: 	local $pagedescription = "";
174: 	local $pagekeywords="";
175: 	local $pagebaseurl="";
176: 
177: 	$pagetoprocess = $_[0];
178: 
179: 	if($setforrestart==1)
180: 	{
181: 		# write URL to restart file.
182: 
183: 		open(RESTARTFILE,">restartwith.txt") || die "Cannot open restart file\n";
184: 		print RESTARTFILE $pagetoprocess;
185: 		close(RESTARTFILE);
186: 	}
187: 
188: 	# check we have not already visited this page
189: 
190: 	$pagealreadyvisited=0;
191: 
192: 	foreach (@visitedpages)
193: 	{
194: 		if($_ eq $pagetoprocess)
195: 		{
196: 			$pagealreadyvisited=1;
197: 		}
198: 
199: 	}
200: 
201: 	if ($pagealreadyvisited == 0)
202: 	{
203: 		print "Processing: $_[0]...";
204: 
205: 		# request the page
206: 
207: 		$request = HTTP::Request->new('GET', $_[0]);		
208: 		$response = $ua->request($request); 	
209: 
210: 		if ($response->is_success) 
211: 		{		    
212: 			if($logtofile==1)
213:      			{
214: 				open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
215:      				print LOGFILE "Processing: $_[0]...Response OK\n";
216: 				close(LOGFILE);
217: 		  	}
218: 			
219: 			# parse retrieved HTML
220: 			
221: 			@linkarray=();
222: 			$html=$response->content;
223: 			$treeparse=HTML::TreeBuilder->new;
224: 			$treeparse->parse($html);
225: 			
226: 			# extract anchor links
227: 
228: 			$rlinkarray=$treeparse->extract_links("a");
229: 
230: 			# call treewalker function to check meta tags
231: 
232: 			$treeparse->traverse(\&treewalker);
233: 			$treeparse->delete();
234: 			$pagebaseurl=$response->base;
235: 
236: 			if($logtofile==1)
237: 		     			{
238: 						open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
239: 		     				print LOGFILE "    Title: $pagetitle\n";
240: 		     				print LOGFILE "    Description: $pagedescription\n";
241: 		     				print LOGFILE "    Keywords: $pagekeywords\n";
242: 						print LOGFILE "    Base URL: $pagebaseurl\n";
243: 						close(LOGFILE);
244: 		     			}
245: 	
246: 			if($usedatabase==1)
247: 			{
248: 
249: 				# write page details to database
250: 
251: 				# DBI::ODBC falls over with any string
252: 				# longer than 255
253: 
254: 				if(length($pagetitle)>255)
255: 				{
256: 					$pagetitle=substr($pagetitle,0,255);
257: 				}
258: 
259: 				if(length($pagedescription)>255)
260: 				{
261: 					$pagedescription=substr($pagedescription,0,255);
262: 				}
263: 				
264: 				if(length($pagekeywords)>255)
265: 				{
266: 					$pagekeywords=substr($pagekeywords,0,255);
267: 				}
268: 								
269: 				my $DSN = "DSN=PageData";
270: 
271: 				my $dbh  = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";
272: 
273: 				my $sql_handle=$dbh->prepare(q{
274: 						INSERT INTO Page (PageURL, Title, Description,Keywords) VALUES (?, ?, ?, ?)
275: 						}) || die $dbh->errstr;
276: 
277: 				$sql_handle->execute("$_[0]","$pagetitle","$pagedescription","$pagekeywords")
278: 						    || die $dbh->errstr;
279: 
280: 				$dbh->disconnect();
281: 
282: 			}
283: 
284: 			# add page to visited pages array
285: 
286: 			push(@visitedpages,$_[0]);	
287: 
288: 			print "OK\n";
289: 
290: 			# convert links from a referenced array to
291: 			# a normal array
292: 			
293: 			foreach $rlink(@$rlinkarray)
294: 			{
295: 	
296: 				push(@linkarray,$$rlink[0]);	
297: 			}
298: 
299: 			# create full URLs from links
300: 
301: 			$baseurl=$response->base;
302: 			@linkarray = map { $_= url($_, $baseurl)->abs; } @linkarray;
303: 
304: 			foreach (@linkarray)
305: 			{
306: 				if($logtofile==1)
307: 		     		{
308: 					open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
309: 		     			print LOGFILE "    $_\n";
310: 					close(LOGFILE);
311: 		     		}
312: 
313: 			}
314: 		
315: 			# put in seperate loop so that printout is correct
316: 			foreach (@linkarray)
317: 			{
318: 				# if link is http and does not contain
319: 				# any odd charcters then call this function
320: 				# recursively passing in the link
321: 
322: 				if (/http:/i)
323: 				{
324: 					if (/[#\@\$]/)
325: 					{
326: 					}
327: 					else
328: 					{
329: 						if($drilldown == 1)
330: 						{
331: 							&gethtml($_);
332: 						}
333: 					}
334: 				}
335: 			}
336: 		} 
337: 		else 
338: 		{
339: 		     print "Failed\n";
340: 		     if($logtofile==1)
341:      		     {
342: 			open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
343:      			print LOGFILE "Processing: $_[0]...Failed\n";
344: 			close(LOGFILE);
345: 		     }
346: 		}
347: 	}
348: 
349: }
350: 
351: # Used to find title tag, and description and keyword metatags.
352: sub treewalker
353: {
354: 	my ($node, $start, $depth) = @_;
355:         if (ref $node) 
356: 	{
357:              my $tag = $node->tag;
358: 
359: 	     if ($tag eq "meta")
360: 	     {
361: 		my $metaname=$node->attr("name");
362: 		if ($metaname ne "")
363: 		{
364: 			if ($metaname=~ /description/i)
365: 			{
366: 				my $description=$node->attr("content");
367: 
368: 				# remove CR and LF from description.
369: 				$description =~ s/\n/ /sg;
370: 				$description =~ s/\r/ /sg;
371: 				$pagedescription = $description;		
372: 			}
373: 			if ($metaname=~ /keywords/i)
374: 			{
375: 				my $keywords=$node->attr("content");
376: 
377: 				# remove CR and LF from description.
378: 				$keywords =~ s/\n/ /sg;
379: 				$keywords =~ s/\r/ /sg;
380: 				$pagekeywords = $keywords;		
381: 			}
382: 		}
383:      	     }     
384: 	
385: 	     if ($tag eq "title" && $pagetitle eq "")
386: 	     {
387: 		my $contentnodearray=$node->content;
388: 		foreach my $contentnode(@$contentnodearray)
389: 		{
390: 			if (not ref $contentnode)
391: 			{
392: 				$pagetitle=$contentnode;		
393: 			}
394: 		}
395: 	     }
396: 	
397: 	
398: 
399: 	}
400: 	return 1; # This makes it recurse through all sub nodes
401: }
402:
In reply to ChilliBot Web Crawler by ChilliHead
Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.

Perl Monk, Perl Meditation
	PerlMonks