#!/usr/local/bin/perl
#
# Web crawler: fetches a page, stores its URL, Title, Description and
# Keywords in an ODBC database, then optionally follows the links on
# the page and processes any other pages it finds.
#
# Switches:
#   /u url      start crawling at this URL
#   /c          continue from the URL saved in restartwith.txt
#   /f          follow links on each page and process the pages found
#   /r          save each URL to restartwith.txt before processing it
#   /d          write page details to the database
#   /l logfile  append progress information to logfile
#
# Either /u or /c must be specified.

use strict;
use warnings;

use HTML::TreeBuilder;
use LWP::RobotUA;
use URI::URL;
use DBI;

my $RESTART_FILE = 'restartwith.txt';
my $DSN          = 'DSN=PageData';

# Polite robot user agent: identify ourselves and honour the RobotUA
# per-host request delay (the delay unit is minutes).
my $ua = LWP::RobotUA->new(
    'ChilliBot/0.1.' . $LWP::VERSION,
    'ChilliBot@nextmilltech.co.uk',
);
$ua->delay(1);

print $ua->agent, "\n";

# Option flags, set from the command line by parse_arguments().
my $drilldown     = 0;      # /f - follow links
my $logtofile     = 0;      # /l - write a logfile
my $usedatabase   = 0;      # /d - store pages in the database
my $setforrestart = 0;      # /r - record a restart point
my $continue      = 0;      # /c - resume from the restart point
my $logfilename;
my $startpage     = '';

# URLs already processed this run (or already present in the database).
# A hash gives O(1) lookup; the original scanned an array per page.
my %visited;

$| = 1;    # unbuffered output so progress messages appear immediately

if (@ARGV) {
    parse_arguments();

    if ($logtofile) {
        # Truncate the logfile at the start of the run.
        open my $log, '>', $logfilename
            or die "Cannot open logfile $logfilename\n";
        close $log;
    }

    load_visited_from_database() if $usedatabase;

    if ($continue) {
        # Resume from the URL saved by a previous /r run.  The last
        # non-empty line of the file wins (matches the original loop).
        open my $fh, '<', $RESTART_FILE
            or die "Cannot open restart file\n";
        while (my $line = <$fh>) {
            chomp $line;    # original kept the newline, breaking the request URL
            $startpage = $line if $line ne '';
        }
        close $fh;
    }

    gethtml($startpage) if $startpage ne '';
}
else {
    print_usage();
}

print "Run Complete\n";

# ----------------------------------------------------------------------
# Subroutines

# Set the option flags from @ARGV (see the header comment for switches).
sub parse_arguments {
    for my $i (0 .. $#ARGV) {
        my $arg = $ARGV[$i];
        if    ($arg eq '/f') { $drilldown     = 1 }
        elsif ($arg eq '/d') { $usedatabase   = 1 }
        elsif ($arg eq '/r') { $setforrestart = 1 }
        elsif ($arg eq '/c') { $continue      = 1 }
        elsif ($arg eq '/l') { $logtofile = 1; $logfilename = $ARGV[$i + 1] }
        elsif ($arg eq '/u') { $startpage = $ARGV[$i + 1] }
    }
    die "/l requires a logfile name\n" if $logtofile && !defined $logfilename;
}

# Print the command-line usage summary.  (The original also listed a
# "/s" switch that was never implemented; it has been removed.)
sub print_usage {
    print "Usage:\n";
    print "  perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/l logfile]\n";
    print "  where /u - url to start crawl from\n";
    print "        /f - follow links on each page\n";
    print "        /d - add page details to database\n";
    print "        /r - save last accessed url for restart with /c\n";
    print "        /c - continue from last restart-saved url\n";
    print "        /l - output to logfile\n\n";
    print "  note: either /u or /c must be specified\n\n";
}

# Append one message to the logfile.  A no-op unless /l was given.
# The file is opened and closed per call so a crash loses nothing.
sub write_log {
    my ($message) = @_;
    return unless $logtofile;
    open my $log, '>>', $logfilename
        or die "Cannot open logfile $logfilename\n";
    print {$log} $message;
    close $log;
}

# Pre-load %visited with every URL already stored in the database so a
# page is never fetched or inserted twice across runs.
sub load_visited_from_database {
    print "Building visited pages list...";

    my $dbh = DBI->connect("dbi:ODBC:$DSN") or die "$DBI::errstr\n";
    my $sth = $dbh->prepare('SELECT PageURL FROM Page') or die $dbh->errstr;
    $sth->execute() or die $dbh->errstr;

    while (my @row = $sth->fetchrow_array()) {
        $visited{ $row[0] } = 1;
    }
    $dbh->disconnect();

    print "done\n";
}

# Insert one page's details into the database via placeholders.
# The DBI::ODBC driver falls over with any string longer than 255
# characters, so the text fields are truncated first.
sub store_page {
    my ($url, $title, $description, $keywords) = @_;

    for ($title, $description, $keywords) {
        $_ = substr($_, 0, 255) if length($_) > 255;
    }

    my $dbh = DBI->connect("dbi:ODBC:$DSN") or die "$DBI::errstr\n";
    my $sth = $dbh->prepare(q{
        INSERT INTO Page (PageURL, Title, Description, Keywords)
        VALUES (?, ?, ?, ?)
    }) or die $dbh->errstr;
    $sth->execute($url, $title, $description, $keywords)
        or die $dbh->errstr;
    $dbh->disconnect();
}

# Pull the title, meta description and meta keywords out of a parsed
# HTML::TreeBuilder tree.  Returns ($title, $description, $keywords);
# anything missing comes back as ''.  Replaces the original traverse()
# callback, which communicated through dynamically-scoped globals.
sub extract_page_info {
    my ($tree) = @_;
    my ($title, $description, $keywords) = ('', '', '');

    if (my $title_node = $tree->look_down(_tag => 'title')) {
        for my $child (@{ $title_node->content || [] }) {
            $title = $child unless ref $child;    # plain-text child node
        }
    }

    for my $meta ($tree->look_down(_tag => 'meta')) {
        my $name = $meta->attr('name');
        next unless defined $name;                # original warned on undef here
        my $content = $meta->attr('content');
        next unless defined $content;
        $content =~ s/[\r\n]/ /g;                 # keep each field on one line
        if    ($name =~ /description/i) { $description = $content }
        elsif ($name =~ /keywords/i)    { $keywords    = $content }
    }

    return ($title, $description, $keywords);
}

# Fetch one page, log and store its details, then (with /f) recurse
# into the http links it contains.
sub gethtml {
    my ($url) = @_;

    if ($setforrestart) {
        # Record the current URL so /c can resume here after a crash.
        open my $fh, '>', $RESTART_FILE or die "Cannot open restart file\n";
        print {$fh} $url;
        close $fh;
    }

    return if $visited{$url};

    print "Processing: $url...";

    my $response = $ua->request(HTTP::Request->new(GET => $url));

    unless ($response->is_success) {
        print "Failed\n";
        write_log("Processing: $url...Failed\n");
        return;
    }

    write_log("Processing: $url...Response OK\n");

    # Parse the retrieved HTML and extract what we need, then free the
    # tree (HTML::TreeBuilder trees must be deleted explicitly).
    my $tree = HTML::TreeBuilder->new;
    $tree->parse($response->content);

    my $anchor_links = $tree->extract_links('a');
    my ($title, $description, $keywords) = extract_page_info($tree);
    $tree->delete();

    my $baseurl = $response->base;

    write_log(  "   Title: $title\n"
              . "   Description: $description\n"
              . "   Keywords: $keywords\n"
              . "   Base URL: $baseurl\n");

    store_page($url, $title, $description, $keywords) if $usedatabase;

    $visited{$url} = 1;
    print "OK\n";

    # extract_links() returns a ref to [link, element, attr, tag] rows;
    # resolve each href against the page's base URL.
    my @links = map { url($_->[0], $baseurl)->abs } @{$anchor_links};

    write_log("   $_\n") for @links;

    return unless $drilldown;

    for my $link (@links) {
        # Follow plain http links only; skip URLs containing the odd
        # characters (#, @, $) the original crawl avoided.
        next unless $link =~ m{^http:}i;
        next if $link =~ /[#\@\$]/;
        gethtml($link);
    }
    return;
}
Back to
Craft