Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl Monk, Perl Meditation
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
    0: #!/usr/local/bin/perl
    1: 
    2: # This program uses the LWP module to read a web page
    3: # and store its URL, Title, Description and Keywords in
    4: # a database. It then follows the links on the page and
    5: # processes any other pages it finds. The following 
    6: # switches are used to control how the program operates:
    7: # 	/u specify the url to start at
    8: #	/c continue from the last saved restart point
    9: #	/f follow links on page and process pages found
    10: #	/r write next page to process to a restart file
    11: #	/d update the database
    12: #	/l produce logfile of crawl
    13: #   either /u or /c must be specified.
    14: #
    15: # This is my first attempt at a reasonable program in Perl
    16: # so please feel free to criticise to your hearts content
    17: # (its the only way I will learn).
    18: 
    19: use HTML::TreeBuilder;
    20: use LWP::RobotUA;
    21: use URI::URL;
    22: use DBI;
    23: 
    24: # Setup user agent for robot
    25: 
    26: $ua = LWP::RobotUA->new('ChilliBot/0.1.' . LWP::Version,'ChilliBot@nextmilltech.co.uk');
    27: 
    28: $ua->delay(1);
    29: 
    30: print $ua->agent;
    31: print "\n";
    32: 
    33: # check if any arguements entered
    34: 
    35: if(scalar(@ARGV) > 0)
    36: {
    37: 	my $startpage="";
    38: 
    39: 	local @visitedpages;
    40: 	local $logtofile = 0;
    41: 	local $drilldown = 0;
    42: 	local $usedatabase = 0;
    43: 	local $setforrestart =0;
    44: 	local $continue=0;
    45: 	local $logfilename;
    46: 
    47: 	$|=1; # set autoflush on for logfile
    48: 
    49: 	# check parameters and set appropriate variables.
    50: 
    51: 	for(my $i=0;$i<scalar(@ARGV);$i++)
    52: 	{
    53: 		if($ARGV[$i] eq "/f")
    54: 		{
    55: 			$drilldown = 1;
    56: 		}
    57: 		if($ARGV[$i] eq "/l")
    58: 		{
    59: 			$logtofile = 1;
    60: 			$logfilename=$ARGV[$i+1];
    61: 		}
    62: 		if($ARGV[$i] eq "/u")
    63: 		{
    64: 			$startpage=$ARGV[$i+1];
    65: 		}
    66: 		if($ARGV[$i] eq "/d")
    67: 		{
    68: 			$usedatabase = 1;
    69: 		}
    70: 		if($ARGV[$i] eq "/r")
    71: 		{
    72: 			$setforrestart = 1;
    73: 		}
    74: 		if($ARGV[$i] eq "/c")
    75: 		{
    76: 			$continue= 1;
    77: 		}
    78: 
    79: 	}
    80: 
    81: 	if($logtofile==1)
    82: 	{
    83: 		open(LOGFILE,">$logfilename") || die "Cannot open logfile $logfilename\n";
    84: 		close(LOGFILE);
    85: 	}
    86: 
    87: 
    88: 	# we do not want to visit pages already visited so keep an array
    89: 	# of their URLs
    90: 
    91: 	@visitedpages=();
    92: 
    93: 	if($usedatabase==1)
    94: 	{
    95: 		# if we are using the database then add all URLs from it
    96: 		# to the list of visited pages.
    97: 
    98: 		print "Building visited pages list...";
    99: 
    100: 		my $DSN = "DSN=PageData";
    101: 
    102: 		my $dbh  = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";
    103: 
    104: 		my $sql_handle=$dbh->prepare("SELECT PageURL FROM Page") || die $dbh->errstr;
    105: 
    106: 		$sql_handle->execute() || die $dbh->errstr;
    107: 
    108: 		while ( @row = $sql_handle->fetchrow_array() )
    109: 		{
    110: 			push(@visitedpages,$row[0]);	
    111: 		}
    112: 
    113: 		$dbh->disconnect();
    114: 		
    115: 		print "done\n";
    116: 	}
    117: 
    118: 	if($continue==1)
    119: 	{
    120: 		# if we are continuing then find which page to continue from
    121: 
    122: 		open(CONTINUE,"restartwith.txt") || die "Cannot open restart file\n";
    123: 		my @continueeurl=<CONTINUE>;
    124: 		foreach (@continueeurl)
    125: 		{
    126: 			$startpage=$_;
    127: 		}
    128: 		close(CONTINUE);
    129: 	}
    130: 
    131: 	if($startpage ne "")
    132: 	{
    133: 		&gethtml($startpage);
    134: 	}
    135: 
    136: 
    137: 
    138: 
    139: }
    140: else
    141: {
    142: 	# No parameters entered so printout the usage information
    143: 
    144: 	print "Usage:\n";
    145: 	print "    perl robot.pl [/u start_url] [/f] [/d] [/r] [/c] [/s] [/l logfile]\n";
    146: 	print "        where /u - url to start crawl from\n";
    147: 	print "              /f - follow links on each page\n";
    148: 	print "              /d - add page details to database\n";
    149: 	print "              /r - save last accessed url for restart with /c\n";
    150: 	print "              /c - continue from last restart-saved url\n";
    151: 	print "              /l - output to logfile\n\n";
    152: 	print "	       note: either /u or /c must be specified\n\n";
    153: }
    154: print ("Run Complete\n");
    155: 
    156: 
    157: # main routine
    158: 
    159: sub gethtml
    160: {
    161: 
    162: 	my $html;
    163: 	my $treeparse;
    164: 
    165: 	my $rlink;
    166: 	my @linkarray;
    167: 	my $baseurl;
    168: 	my $pagealreadyvisited;
    169: 	my $pagetoprocess;
    170: 	my $rlinkarray;
    171: 
    172: 	local $pagetitle ="";
    173: 	local $pagedescription = "";
    174: 	local $pagekeywords="";
    175: 	local $pagebaseurl="";
    176: 
    177: 	$pagetoprocess = $_[0];
    178: 
    179: 	if($setforrestart==1)
    180: 	{
    181: 		# write URL to restart file.
    182: 
    183: 		open(RESTARTFILE,">restartwith.txt") || die "Cannot open restart file\n";
    184: 		print RESTARTFILE $pagetoprocess;
    185: 		close(RESTARTFILE);
    186: 	}
    187: 
    188: 	# check we have not already visited this page
    189: 
    190: 	$pagealreadyvisited=0;
    191: 
    192: 	foreach (@visitedpages)
    193: 	{
    194: 		if($_ eq $pagetoprocess)
    195: 		{
    196: 			$pagealreadyvisited=1;
    197: 		}
    198: 
    199: 	}
    200: 
    201: 	if ($pagealreadyvisited == 0)
    202: 	{
    203: 		print "Processing: $_[0]...";
    204: 
    205: 		# request the page
    206: 
    207: 		$request = HTTP::Request->new('GET', $_[0]);		
    208: 		$response = $ua->request($request); 	
    209: 
    210: 		if ($response->is_success) 
    211: 		{		    
    212: 			if($logtofile==1)
    213:      			{
    214: 				open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
    215:      				print LOGFILE "Processing: $_[0]...Response OK\n";
    216: 				close(LOGFILE);
    217: 		  	}
    218: 			
    219: 			# parse retrieved HTML
    220: 			
    221: 			@linkarray=();
    222: 			$html=$response->content;
    223: 			$treeparse=HTML::TreeBuilder->new;
    224: 			$treeparse->parse($html);
    225: 			
    226: 			# extract anchor links
    227: 
    228: 			$rlinkarray=$treeparse->extract_links("a");
    229: 
    230: 			# call treewalker function to check meta tags
    231: 
    232: 			$treeparse->traverse(\&treewalker);
    233: 			$treeparse->delete();
    234: 			$pagebaseurl=$response->base;
    235: 
    236: 			if($logtofile==1)
    237: 		     			{
    238: 						open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
    239: 		     				print LOGFILE "    Title: $pagetitle\n";
    240: 		     				print LOGFILE "    Description: $pagedescription\n";
    241: 		     				print LOGFILE "    Keywords: $pagekeywords\n";
    242: 						print LOGFILE "    Base URL: $pagebaseurl\n";
    243: 						close(LOGFILE);
    244: 		     			}
    245: 	
    246: 			if($usedatabase==1)
    247: 			{
    248: 
    249: 				# write page details to database
    250: 
    251: 				# DBI::ODBC falls over with any string
    252: 				# longer than 255
    253: 
    254: 				if(length($pagetitle)>255)
    255: 				{
    256: 					$pagetitle=substr($pagetitle,0,255);
    257: 				}
    258: 
    259: 				if(length($pagedescription)>255)
    260: 				{
    261: 					$pagedescription=substr($pagedescription,0,255);
    262: 				}
    263: 				
    264: 				if(length($pagekeywords)>255)
    265: 				{
    266: 					$pagekeywords=substr($pagekeywords,0,255);
    267: 				}
    268: 								
    269: 				my $DSN = "DSN=PageData";
    270: 
    271: 				my $dbh  = DBI->connect("dbi:ODBC:$DSN") || die "$DBI::errstr\n";
    272: 
    273: 				my $sql_handle=$dbh->prepare(q{
    274: 						INSERT INTO Page (PageURL, Title, Description,Keywords) VALUES (?, ?, ?, ?)
    275: 						}) || die $dbh->errstr;
    276: 
    277: 				$sql_handle->execute("$_[0]","$pagetitle","$pagedescription","$pagekeywords")
    278: 						    || die $dbh->errstr;
    279: 
    280: 				$dbh->disconnect();
    281: 
    282: 			}
    283: 
    284: 			# add page to visited pages array
    285: 
    286: 			push(@visitedpages,$_[0]);	
    287: 
    288: 			print "OK\n";
    289: 
    290: 			# convert links from a referenced array to
    291: 			# a normal array
    292: 			
    293: 			foreach $rlink(@$rlinkarray)
    294: 			{
    295: 	
    296: 				push(@linkarray,$$rlink[0]);	
    297: 			}
    298: 
    299: 			# create full URLs from links
    300: 
    301: 			$baseurl=$response->base;
    302: 			@linkarray = map { $_= url($_, $baseurl)->abs; } @linkarray;
    303: 
    304: 			foreach (@linkarray)
    305: 			{
    306: 				if($logtofile==1)
    307: 		     		{
    308: 					open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
    309: 		     			print LOGFILE "    $_\n";
    310: 					close(LOGFILE);
    311: 		     		}
    312: 
    313: 			}
    314: 		
    315: 			# put in seperate loop so that printout is correct
    316: 			foreach (@linkarray)
    317: 			{
    318: 				# if link is http and does not contain
    319: 				# any odd charcters then call this function
    320: 				# recursively passing in the link
    321: 
    322: 				if (/http:/i)
    323: 				{
    324: 					if (/[#\@\$]/)
    325: 					{
    326: 					}
    327: 					else
    328: 					{
    329: 						if($drilldown == 1)
    330: 						{
    331: 							&gethtml($_);
    332: 						}
    333: 					}
    334: 				}
    335: 			}
    336: 		} 
    337: 		else 
    338: 		{
    339: 		     print "Failed\n";
    340: 		     if($logtofile==1)
    341:      		     {
    342: 			open(LOGFILE,">>$logfilename") || die "Cannot open logfile $logfilename\n";
    343:      			print LOGFILE "Processing: $_[0]...Failed\n";
    344: 			close(LOGFILE);
    345: 		     }
    346: 		}
    347: 	}
    348: 
    349: }
    350: 
    351: # Used to find title tag, and description and keyword metatags.
    352: sub treewalker
    353: {
    354: 	my ($node, $start, $depth) = @_;
    355:         if (ref $node) 
    356: 	{
    357:              my $tag = $node->tag;
    358: 
    359: 	     if ($tag eq "meta")
    360: 	     {
    361: 		my $metaname=$node->attr("name");
    362: 		if ($metaname ne "")
    363: 		{
    364: 			if ($metaname=~ /description/i)
    365: 			{
    366: 				my $description=$node->attr("content");
    367: 
    368: 				# remove CR and LF from description.
    369: 				$description =~ s/\n/ /sg;
    370: 				$description =~ s/\r/ /sg;
    371: 				$pagedescription = $description;		
    372: 			}
    373: 			if ($metaname=~ /keywords/i)
    374: 			{
    375: 				my $keywords=$node->attr("content");
    376: 
    377: 				# remove CR and LF from description.
    378: 				$keywords =~ s/\n/ /sg;
    379: 				$keywords =~ s/\r/ /sg;
    380: 				$pagekeywords = $keywords;		
    381: 			}
    382: 		}
    383:      	     }     
    384: 	
    385: 	     if ($tag eq "title" && $pagetitle eq "")
    386: 	     {
    387: 		my $contentnodearray=$node->content;
    388: 		foreach my $contentnode(@$contentnodearray)
    389: 		{
    390: 			if (not ref $contentnode)
    391: 			{
    392: 				$pagetitle=$contentnode;		
    393: 			}
    394: 		}
    395: 	     }
    396: 	
    397: 	
    398: 
    399: 	}
    400: 	return 1; # This makes it recurse through all sub nodes
    401: }
    402:  

In reply to ChilliBot Web Crawler by ChilliHead

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others surveying the Monastery: (4)
As of 2024-04-23 19:38 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found