0: #!/usr/local/bin/perl
1:
2: # This program uses the LWP module to read a web page
3: # and store its URL, Title, Description and Keywords in
4: # a database. It then follows the links on the page and
5: # processes any other pages it finds. The following
6: # switches are used to control how the program operates:
7: # /u specify the url to start at
8: # /c continue from the last saved restart point
9: # /f follow links on page and process pages found
10: # /r write next page to process to a restart file
11: # /d update the database
12: # /l produce logfile of crawl
13: # either /u or /c must be specified.
14: #
15: # This is my first attempt at a reasonable program in Perl
16: # so please feel free to criticise to your heart's content
17: # (it's the only way I will learn).
18:
19: use HTML::TreeBuilder;
20: use LWP::RobotUA;
21: use URI::URL;
22: use DBI;
23:
24: # Setup user agent for robot
25:
# Build the robot user agent. LWP::RobotUA honours robots.txt and
# enforces a per-host politeness delay; the agent string and contact
# address identify this crawler to site operators.
$ua = LWP::RobotUA->new( 'ChilliBot/0.1.' . LWP::Version,
                         'ChilliBot@nextmilltech.co.uk' );

# Wait between successive requests to the same host
# (NOTE(review): RobotUA->delay is in minutes, not seconds).
$ua->delay(1);

# Announce which agent string we are crawling as.
print $ua->agent, "\n";
32:
33: # check if any arguments entered
34:
if (scalar(@ARGV) > 0)
{
    my $startpage = "";

    # NOTE: these stay dynamically scoped (local, not my) on purpose --
    # gethtml() and treewalker() further down read them as package
    # globals, so lexical scoping would hide them.
    local @visitedpages;
    local $logtofile     = 0;
    local $drilldown     = 0;
    local $usedatabase   = 0;
    local $setforrestart = 0;
    local $continue      = 0;
    local $logfilename;

    $| = 1;    # autoflush so progress messages appear immediately

    # Parse the command-line switches into the control flags above.
    # The switches are mutually exclusive per position, so an elsif
    # chain is used; /l and /u consume the following argument.
    for (my $i = 0; $i < scalar(@ARGV); $i++)
    {
        if ($ARGV[$i] eq "/f")
        {
            $drilldown = 1;
        }
        elsif ($ARGV[$i] eq "/l")
        {
            $logtofile   = 1;
            $logfilename = $ARGV[$i + 1];
        }
        elsif ($ARGV[$i] eq "/u")
        {
            $startpage = $ARGV[$i + 1];
        }
        elsif ($ARGV[$i] eq "/d")
        {
            $usedatabase = 1;
        }
        elsif ($ARGV[$i] eq "/r")
        {
            $setforrestart = 1;
        }
        elsif ($ARGV[$i] eq "/c")
        {
            $continue = 1;
        }
    }

    if ($logtofile == 1)
    {
        # Create/truncate the logfile up front; later writes append.
        # Three-arg open with a lexical handle prevents mode injection
        # through a crafted filename.
        open(my $logfh, '>', $logfilename)
            or die "Cannot open logfile $logfilename: $!\n";
        close($logfh);
    }

    # We do not want to visit pages already visited, so keep an array
    # of their URLs.
    @visitedpages = ();

    if ($usedatabase == 1)
    {
        # Seed the visited-pages list with every URL already stored in
        # the database so re-runs do not re-insert duplicates.
        print "Building visited pages list...";

        my $DSN = "DSN=PageData";

        my $dbh = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";

        my $sql_handle = $dbh->prepare("SELECT PageURL FROM Page")
            || die $dbh->errstr;

        $sql_handle->execute() || die $dbh->errstr;

        # my @row keeps the row buffer out of the package namespace.
        while (my @row = $sql_handle->fetchrow_array())
        {
            push(@visitedpages, $row[0]);
        }

        $dbh->disconnect();

        print "done\n";
    }

    if ($continue == 1)
    {
        # Continuing: the restart file holds the URL to resume from
        # (if it somehow has more than one line, the last one wins,
        # matching the original foreach-overwrites behaviour).
        open(my $contfh, '<', "restartwith.txt")
            or die "Cannot open restart file: $!\n";
        while (my $line = <$contfh>)
        {
            $startpage = $line;
        }
        close($contfh);
    }

    if ($startpage ne "")
    {
        gethtml($startpage);
    }
}
else
{
    # No parameters entered, so print the usage information.
    # (The stray [/s] was removed from the synopsis: no such switch
    # is parsed above.)

    print "Usage:\n";
    print " perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/l logfile]\n";
    print " where /u - url to start crawl from\n";
    print "       /f - follow links on each page\n";
    print "       /d - add page details to database\n";
    print "       /r - save last accessed url for restart with /c\n";
    print "       /c - continue from last restart-saved url\n";
    print "       /l - output to logfile\n\n";
    print " note: either /u or /c must be specified\n\n";
}
print("Run Complete\n");
155:
156:
157: # main routine
158:
# Fetch one page, record its metadata, and (optionally) recurse into
# its links.
#
# Arg 0: absolute URL of the page to process.
# Reads the dynamically scoped flags set in the driver code above
# ($setforrestart, $logtofile, $drilldown, $usedatabase, $logfilename,
# $ua, @visitedpages). Returns nothing useful; works by side effect.
sub gethtml
{
    my $html;
    my $treeparse;
    my $rlink;
    my @linkarray;
    my $baseurl;
    my $pagealreadyvisited;
    my $pagetoprocess;
    my $rlinkarray;

    # Dynamically scoped on purpose: treewalker() fills these in while
    # we traverse the parse tree (it sees them as package globals).
    local $pagetitle       = "";
    local $pagedescription = "";
    local $pagekeywords    = "";
    local $pagebaseurl     = "";

    $pagetoprocess = $_[0];

    if ($setforrestart == 1)
    {
        # Record the URL about to be processed so /c can resume here.
        open(my $restartfh, '>', "restartwith.txt")
            or die "Cannot open restart file: $!\n";
        print $restartfh $pagetoprocess;
        close($restartfh);
    }

    # Check we have not already visited this page.
    $pagealreadyvisited = 0;

    foreach (@visitedpages)
    {
        if ($_ eq $pagetoprocess)
        {
            $pagealreadyvisited = 1;
            last;    # no need to scan the rest once found
        }
    }

    if ($pagealreadyvisited == 0)
    {
        print "Processing: $pagetoprocess...";

        # Request the page. These MUST be lexical: this sub recurses,
        # and the original package-global $response was clobbered by
        # the recursive calls.
        my $request  = HTTP::Request->new('GET', $pagetoprocess);
        my $response = $ua->request($request);

        if ($response->is_success)
        {
            if ($logtofile == 1)
            {
                open(my $logfh, '>>', $logfilename)
                    or die "Cannot open logfile $logfilename: $!\n";
                print $logfh "Processing: $pagetoprocess...Response OK\n";
                close($logfh);
            }

            # Parse retrieved HTML.
            @linkarray = ();
            $html      = $response->content;
            $treeparse = HTML::TreeBuilder->new;
            $treeparse->parse($html);

            # Extract anchor links before the tree is deleted.
            $rlinkarray = $treeparse->extract_links("a");

            # Walk the tree to pick up title/description/keywords,
            # then free it (TreeBuilder trees are not garbage
            # collected automatically).
            $treeparse->traverse(\&treewalker);
            $treeparse->delete();
            $pagebaseurl = $response->base;

            if ($logtofile == 1)
            {
                open(my $logfh, '>>', $logfilename)
                    or die "Cannot open logfile $logfilename: $!\n";
                print $logfh "    Title: $pagetitle\n";
                print $logfh "    Description: $pagedescription\n";
                print $logfh "    Keywords: $pagekeywords\n";
                print $logfh "    Base URL: $pagebaseurl\n";
                close($logfh);
            }

            if ($usedatabase == 1)
            {
                # Write page details to the database.
                # DBI::ODBC falls over with any string longer than
                # 255, so truncate each field first.
                foreach my $field (\$pagetitle, \$pagedescription,
                                   \$pagekeywords)
                {
                    if (length($$field) > 255)
                    {
                        $$field = substr($$field, 0, 255);
                    }
                }

                my $DSN = "DSN=PageData";

                my $dbh = DBI->connect("dbi:ODBC:$DSN")
                    || die "$DBI::errstr\n";

                my $sql_handle = $dbh->prepare(q{
                    INSERT INTO Page (PageURL, Title, Description,Keywords) VALUES (?, ?, ?, ?)
                }) || die $dbh->errstr;

                $sql_handle->execute($pagetoprocess, $pagetitle,
                                     $pagedescription, $pagekeywords)
                    || die $dbh->errstr;

                $dbh->disconnect();
            }

            # Add page to the visited-pages array so recursion below
            # does not revisit it.
            push(@visitedpages, $pagetoprocess);

            print "OK\n";

            # extract_links() returns a ref to an array of
            # [link, element] pairs; keep just the link strings.
            foreach $rlink (@$rlinkarray)
            {
                push(@linkarray, $$rlink[0]);
            }

            # Resolve relative links against the response base URL.
            $baseurl   = $response->base;
            @linkarray = map { url($_, $baseurl)->abs } @linkarray;

            if ($logtofile == 1)
            {
                # Open once for the whole batch instead of reopening
                # the logfile for every single link.
                open(my $logfh, '>>', $logfilename)
                    or die "Cannot open logfile $logfilename: $!\n";
                foreach (@linkarray)
                {
                    print $logfh "    $_\n";
                }
                close($logfh);
            }

            # Kept as a separate loop so the log printout above is
            # complete before any recursive processing starts.
            foreach (@linkarray)
            {
                # Follow only http links that contain none of the
                # characters (#, @, $) the crawler treats as noise,
                # and only when /f was given.
                next unless /http:/i;
                next if /[#\@\$]/;

                if ($drilldown == 1)
                {
                    gethtml($_);
                }
            }
        }
        else
        {
            print "Failed\n";
            if ($logtofile == 1)
            {
                open(my $logfh, '>>', $logfilename)
                    or die "Cannot open logfile $logfilename: $!\n";
                print $logfh "Processing: $pagetoprocess...Failed\n";
                close($logfh);
            }
        }
    }

}
350:
351: # Used to find title tag, and description and keyword metatags.
# Callback for HTML::TreeBuilder::traverse. Captures the first <title>
# text and the description/keywords <meta> contents into the
# dynamically scoped $pagetitle / $pagedescription / $pagekeywords
# that gethtml() sets up with local().
sub treewalker
{
    my ($node, $start, $depth) = @_;

    # Element nodes are refs; plain text nodes are ordinary strings
    # and carry no tag/attr methods.
    if (ref $node)
    {
        my $tag = $node->tag;

        if ($tag eq "meta")
        {
            my $metaname = $node->attr("name");

            # attr() returns undef when the attribute is absent;
            # guard so nameless meta tags neither warn nor match.
            if (defined $metaname && $metaname ne "")
            {
                if ($metaname =~ /description/i)
                {
                    my $description = $node->attr("content");

                    # Only store when content= is actually present
                    # (undef would otherwise propagate to the DB).
                    if (defined $description)
                    {
                        # Remove CR and LF so the value is one line.
                        $description =~ s/\n/ /sg;
                        $description =~ s/\r/ /sg;
                        $pagedescription = $description;
                    }
                }
                if ($metaname =~ /keywords/i)
                {
                    my $keywords = $node->attr("content");

                    if (defined $keywords)
                    {
                        # Remove CR and LF from the keywords too.
                        $keywords =~ s/\n/ /sg;
                        $keywords =~ s/\r/ /sg;
                        $pagekeywords = $keywords;
                    }
                }
            }
        }

        # First <title> wins; later ones are ignored.
        if ($tag eq "title" && $pagetitle eq "")
        {
            my $contentnodearray = $node->content;
            foreach my $contentnode (@$contentnodearray)
            {
                # content() mixes text strings with element refs;
                # only plain text belongs in the title.
                if (not ref $contentnode)
                {
                    $pagetitle = $contentnode;
                }
            }
        }

    }
    return 1;    # This makes it recurse through all sub nodes
}
402:
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.
|