1: #!/usr/local/bin/perl
2:
3: # This program uses the LWP module to read a web page
4: # and store its URL, Title, Description and Keywords in
5: # a database. It then follows the links on the page and
6: # processes any other pages it finds. The following
7: # switches are used to control how the program operates:
8: # /u specify the url to start at
9: # /c continue from the last saved restart point
10: # /f follow links on page and process pages found
11: # /r write next page to process to a restart file
12: # /d update the database
13: # /l produce logfile of crawl
14: # either /u or /c must be specified.
15: #
# This is my first attempt at a reasonable program in Perl
# so please feel free to criticise to your heart's content
# (it's the only way I will learn).
19:
20: use HTML::TreeBuilder;
21: use LWP::RobotUA;
22: use URI::URL;
23: use DBI;
24:
# Set up the robot user agent. A polite robot identifies itself with a
# name/version string and a contact e-mail address.
# NOTE: the original used the bareword call LWP::Version, a deprecated
# helper that has been removed from modern libwww-perl; $LWP::VERSION is
# the supported way to get the library version.

$ua = LWP::RobotUA->new('ChilliBot/0.1.' . $LWP::VERSION, 'ChilliBot@nextmilltech.co.uk');

$ua->delay(1);    # wait at least 1 minute between requests to a host (delay is in minutes)

print $ua->agent;
print "\n";
33:
# Check the command-line arguments; with none entered we just print the
# usage information and exit.

if(scalar(@ARGV) > 0)
{
    my $startpage = "";

    # Deliberately "local" (dynamically scoped), NOT "my": &gethtml and
    # &treewalker below read these as globals, so they must stay visible
    # down the call chain.
    local @visitedpages;
    local $logtofile     = 0;
    local $drilldown     = 0;
    local $usedatabase   = 0;
    local $setforrestart = 0;
    local $continue      = 0;
    local $logfilename;

    $| = 1;    # autoflush STDOUT so progress messages appear immediately

    # Parse the switches. /u and /l take their value from the next
    # argument; that value is skipped once consumed so it can never be
    # mistaken for a switch itself.

    for(my $i = 0; $i < scalar(@ARGV); $i++)
    {
        if($ARGV[$i] eq "/f")
        {
            $drilldown = 1;
        }
        elsif($ARGV[$i] eq "/l")
        {
            $logtofile = 1;
            $logfilename = $ARGV[++$i];
            die "No logfile name given after /l\n" unless defined $logfilename;
        }
        elsif($ARGV[$i] eq "/u")
        {
            $startpage = $ARGV[++$i];
            die "No URL given after /u\n" unless defined $startpage;
        }
        elsif($ARGV[$i] eq "/d")
        {
            $usedatabase = 1;
        }
        elsif($ARGV[$i] eq "/r")
        {
            $setforrestart = 1;
        }
        elsif($ARGV[$i] eq "/c")
        {
            $continue = 1;
        }
    }

    if($logtofile == 1)
    {
        # Create (or empty) the logfile now; all later writes append.
        open(LOGFILE, '>', $logfilename) || die "Cannot open logfile $logfilename\n";
        close(LOGFILE);
    }

    # We do not want to visit pages already visited so keep an array
    # of their URLs.

    @visitedpages = ();

    if($usedatabase == 1)
    {
        # If we are using the database then seed the visited list with
        # every URL already stored in it.

        print "Building visited pages list...";

        my $DSN = "DSN=PageData";

        my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";

        my $sql_handle = $dbh->prepare("SELECT PageURL FROM Page") || die $dbh->errstr;

        $sql_handle->execute() || die $dbh->errstr;

        while ( my ($pageurl) = $sql_handle->fetchrow_array() )
        {
            push(@visitedpages, $pageurl);
        }

        $dbh->disconnect();

        print "done\n";
    }

    if($continue == 1)
    {
        # Continuing a previous run: read the URL that was saved by /r.

        open(CONTINUE, '<', "restartwith.txt") || die "Cannot open restart file\n";
        my @continueurl = <CONTINUE>;
        close(CONTINUE);

        # The last line of the file is the URL to resume from.
        $startpage = $continueurl[-1] if @continueurl;
        chomp $startpage;    # a stray trailing newline would corrupt the URL
    }

    if($startpage ne "")
    {
        gethtml($startpage);
    }
}
else
{
    # No parameters entered so print out the usage information.

    print "Usage:\n";
    print " perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/l logfile]\n";
    print " where /u - url to start crawl from\n";
    print " /f - follow links on each page\n";
    print " /d - add page details to database\n";
    print " /r - save last accessed url for restart with /c\n";
    print " /c - continue from last restart-saved url\n";
    print " /l - output to logfile\n\n";
    print " note: either /u or /c must be specified\n\n";
}
print ("Run Complete\n");
156:
157:
# Main routine: fetch a single page, record its URL, title, description
# and keywords, then optionally recurse into the links found on it.
#
# Takes one argument: the URL of the page to process.
#
# Relies on the dynamically scoped globals set up by the driver code:
#   $ua, @visitedpages, $setforrestart, $logtofile, $logfilename,
#   $usedatabase, $drilldown
# and shares $pagetitle / $pagedescription / $pagekeywords /
# $pagebaseurl with &treewalker via "local", so every recursive call
# gets a fresh set of page details.

sub gethtml
{
    my $html;
    my $treeparse;

    my $rlink;
    my @linkarray;
    my $baseurl;
    my $pagealreadyvisited;
    my $pagetoprocess;
    my $rlinkarray;

    # Lexical request/response: the original package globals were
    # clobbered by the recursive &gethtml calls further down.
    my $request;
    my $response;

    # "local" so that &treewalker (invoked via traverse below) can fill
    # these in for the page currently being processed.
    local $pagetitle = "";
    local $pagedescription = "";
    local $pagekeywords = "";
    local $pagebaseurl = "";

    $pagetoprocess = $_[0];

    if($setforrestart == 1)
    {
        # Write this URL to the restart file so a later /c run resumes here.

        open(RESTARTFILE, '>', "restartwith.txt") || die "Cannot open restart file\n";
        print RESTARTFILE $pagetoprocess;
        close(RESTARTFILE);
    }

    # Check we have not already visited this page.

    $pagealreadyvisited = 0;

    foreach (@visitedpages)
    {
        if($_ eq $pagetoprocess)
        {
            $pagealreadyvisited = 1;
            last;    # found it - no point scanning the rest
        }
    }

    if ($pagealreadyvisited == 0)
    {
        print "Processing: $pagetoprocess...";

        # Request the page.

        $request = HTTP::Request->new('GET', $pagetoprocess);
        $response = $ua->request($request);

        if ($response->is_success)
        {
            if($logtofile == 1)
            {
                open(LOGFILE, '>>', $logfilename) || die "Cannot open logfile $logfilename\n";
                print LOGFILE "Processing: $pagetoprocess...Response OK\n";
                close(LOGFILE);
            }

            # Parse the retrieved HTML.

            @linkarray = ();
            $html = $response->content;
            $treeparse = HTML::TreeBuilder->new;
            $treeparse->parse($html);

            # Extract anchor links (returns a ref to an array of
            # [url, element, attrname, tag] records).

            $rlinkarray = $treeparse->extract_links("a");

            # Walk the tree to pick up the title and meta tags.

            $treeparse->traverse(\&treewalker);
            $treeparse->delete();
            $pagebaseurl = $response->base;

            if($logtofile == 1)
            {
                open(LOGFILE, '>>', $logfilename) || die "Cannot open logfile $logfilename\n";
                print LOGFILE " Title: $pagetitle\n";
                print LOGFILE " Description: $pagedescription\n";
                print LOGFILE " Keywords: $pagekeywords\n";
                print LOGFILE " Base URL: $pagebaseurl\n";
                close(LOGFILE);
            }

            if($usedatabase == 1)
            {
                # Write the page details to the database.

                # DBI::ODBC falls over with any string longer than 255,
                # so truncate the text fields first.

                if(length($pagetitle) > 255)
                {
                    $pagetitle = substr($pagetitle, 0, 255);
                }

                if(length($pagedescription) > 255)
                {
                    $pagedescription = substr($pagedescription, 0, 255);
                }

                if(length($pagekeywords) > 255)
                {
                    $pagekeywords = substr($pagekeywords, 0, 255);
                }

                my $DSN = "DSN=PageData";

                my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";

                my $sql_handle = $dbh->prepare(q{
                    INSERT INTO Page (PageURL, Title, Description,Keywords) VALUES (?, ?, ?, ?)
                }) || die $dbh->errstr;

                $sql_handle->execute($pagetoprocess, $pagetitle, $pagedescription, $pagekeywords)
                    || die $dbh->errstr;

                $dbh->disconnect();
            }

            # Add the page to the visited pages array.

            push(@visitedpages, $pagetoprocess);

            print "OK\n";

            # Convert links from a referenced array of records to a
            # plain array of URLs (field 0 of each record).

            foreach $rlink (@$rlinkarray)
            {
                push(@linkarray, $$rlink[0]);
            }

            # Create full (absolute) URLs from the links.

            $baseurl = $response->base;
            @linkarray = map { url($_, $baseurl)->abs } @linkarray;

            if($logtofile == 1)
            {
                open(LOGFILE, '>>', $logfilename) || die "Cannot open logfile $logfilename\n";
                foreach (@linkarray)
                {
                    print LOGFILE " $_\n";
                }
                close(LOGFILE);
            }

            # Separate loop so that the printout above stays in order.
            foreach (@linkarray)
            {
                # If the link is http and does not contain any odd
                # characters (fragments, CGI oddities) then call this
                # function recursively passing in the link.

                if (/http:/i && !/[#\@\$]/ && $drilldown == 1)
                {
                    gethtml($_);
                }
            }
        }
        else
        {
            print "Failed\n";
            if($logtofile == 1)
            {
                open(LOGFILE, '>>', $logfilename) || die "Cannot open logfile $logfilename\n";
                print LOGFILE "Processing: $pagetoprocess...Failed\n";
                close(LOGFILE);
            }
        }
    }
}
351:
# traverse() callback: finds the title tag, and the description and
# keyword meta tags, storing them in the dynamically scoped globals
# $pagetitle, $pagedescription and $pagekeywords (set up in &gethtml).
# Guards against undef attr()/content() values, which previously
# produced "uninitialized value" warnings on pages missing those attributes.
sub treewalker
{
    my ($node, $start, $depth) = @_;

    if (ref $node)    # element nodes are refs; text nodes are plain strings
    {
        my $tag = $node->tag;

        if ($tag eq "meta")
        {
            my $metaname = $node->attr("name");
            if (defined $metaname && $metaname ne "")
            {
                if ($metaname =~ /description/i)
                {
                    my $description = $node->attr("content");
                    if (defined $description)
                    {
                        # Remove CR and LF so the description is one line.
                        $description =~ s/\n/ /sg;
                        $description =~ s/\r/ /sg;
                        $pagedescription = $description;
                    }
                }
                if ($metaname =~ /keywords/i)
                {
                    my $keywords = $node->attr("content");
                    if (defined $keywords)
                    {
                        # Remove CR and LF so the keywords are one line.
                        $keywords =~ s/\n/ /sg;
                        $keywords =~ s/\r/ /sg;
                        $pagekeywords = $keywords;
                    }
                }
            }
        }

        # First non-empty <title> wins; its text child becomes the title.
        if ($tag eq "title" && (!defined $pagetitle || $pagetitle eq ""))
        {
            my $contentnodearray = $node->content || [];
            foreach my $contentnode (@$contentnodearray)
            {
                if (not ref $contentnode)
                {
                    $pagetitle = $contentnode;
                }
            }
        }
    }
    return 1;    # this makes it recurse through all sub nodes
}
403: