Has anyone successfully used Parallel::ForkManager to download web pages consistently? I am having two primary problems: 1) it's not very fast for me (in testing I am only using 3 to 4 URLs so far, and I can download that many pages faster without it); 2) the results of my script are inconsistent. Sometimes I get all the requested pages; other times the script errors out on NT :( before getting all the pages. I suspect that I am not calling wait_all_children properly. I believe the script quits prematurely even though I call it, and I have inserted it in every place I can think of with the same inconsistent results. I've included an excerpt of the code below. Please help. The target URLs are for testing purposes only.
########################
use Parallel::ForkManager;
use LWP::UserAgent;
use HTTP::Status;
use HTTP::Request;

%urls = (
    'drudge' => 'http://www.drudgereport.com',
    'rush'   => 'http://www.rushlimbaugh.com/home/today.guest.html',
    'yahoo'  => 'http://www.yahoo.com',
    'cds'    => 'http://www.cdsllc.com/',
);

foreach $myURL (sort(values(%urls))) {
    $count++;
    print "Count is $count\n";
    $document = DOCUMENT_RETRIEVER($myURL);
}

sub DOCUMENT_RETRIEVER {
    $myURL = $_[0];
    $mit   = $myURL;
    print "Commencing DOCUMENT_RETRIEVER number $iteration for $mit\n";
    print "Iteration is $iteration and Count is $count\n";

    # This loop runs exactly once per call, with $iteration == $count.
    for ($iteration = $count; $iteration <= $count; $iteration++) {
        $name = $iteration;
        print "NAME $name\n";

        # A fresh manager is created for every URL, so each manager
        # only ever owns a single child.
        my $pm = Parallel::ForkManager->new(30);

        # In the parent start() is true, so the parent takes the "next"
        # and leaves the loop; only the child runs the code below.
        $pm->start and next;

        print "Starting Child Process $iteration for $mit\n";
        $ua  = LWP::UserAgent->new;
        $ua->agent("$0/0.1 " . $ua->agent);
        $req = HTTP::Request->new(GET => $mit);
        $res = $ua->request($req, "$name.html");    # save the page to a file
        print "Process $iteration Complete\n";
        $pm->finish;    # the child exits here

        # The parent never reaches this line (it took the "next" above),
        # so nothing actually waits for the child to exit.
        $pm->wait_all_children;
        print "Waiting on children\n";
    }
    undef $name;
}
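For reference, here is a minimal sketch of the pattern I believe the docs intend: one manager created before the loop, one start/finish pair per URL, and a single wait_all_children in the parent after the loop. The 30-child limit and the "$name.html"-style output files are carried over from my excerpt; the strict/warnings lines and the status check are my own additions, so treat this as an untested sketch rather than a known-good fix.
########################
use strict;
use warnings;
use Parallel::ForkManager;
use LWP::UserAgent;
use HTTP::Request;

my %urls = (
    'drudge' => 'http://www.drudgereport.com',
    'rush'   => 'http://www.rushlimbaugh.com/home/today.guest.html',
    'yahoo'  => 'http://www.yahoo.com',
    'cds'    => 'http://www.cdsllc.com/',
);

# One manager for the whole batch, created once before the loop.
my $pm = Parallel::ForkManager->new(30);

foreach my $name (sort keys %urls) {
    # Parent: start() returns the child's PID (true), so skip to the
    # next URL. Child: start() returns 0, so fall through and fetch.
    $pm->start and next;

    my $ua = LWP::UserAgent->new;
    $ua->agent("$0/0.1 " . $ua->agent);

    # Saving to "<key>.html" instead of a numbered file; adjust to taste.
    my $res = $ua->request(HTTP::Request->new(GET => $urls{$name}),
                           "$name.html");
    warn "$name failed: " . $res->status_line . "\n"
        unless $res->is_success;

    $pm->finish;    # child exits here
}

# Only the parent gets here; it blocks until every child has exited,
# so the script cannot quit before the downloads are done.
$pm->wait_all_children;
print "All children finished\n";
########################
With a single manager, only the parent falls through to wait_all_children after dispatching every fork, so it stays alive until the last download has finished.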