#!/usr/bin/perl -w
#------------------------------------------------------------#
# Scrape images.google.com for images matching a specific    #
# keyword.                                                    #
#------------------------------------------------------------#
# ./imgo.pl --query "perl monks"                              #
#------------------------------------------------------------#

use HTML::Parser;
use LWP::UserAgent;
use Parallel::ForkManager;
use Getopt::Long;
use URI::Escape;
use strict;

#------------------------------------------------------------#
# Options and other variables we'll need.                     #
#------------------------------------------------------------#

# Defaults
my %opt = (
    dir   => ".",
    safe  => "0",
    procs => "20",
    ua    => "Mozilla/1.0",
    query => "",
);

# Options from the commandline.
GetOptions(
    'verbose' => \$opt{'verbose'},
    'help'    => \$opt{'help'   },
    'safe'    => \$opt{'safe'   },
    'query=s' => \$opt{'query'  },
    'procs=i' => \$opt{'procs'  },
    'ua=s'    => \$opt{'ua'     },
    'dir=s'   => \$opt{'dir'    },
);

# Compose our base URL for images.google.com.
$opt{'query'} = uri_escape($opt{'query'});
my $url = "http://images.google.com/images"
        . "?q=$opt{'query'}"
        . "\&safe=" . ($opt{'safe'} ? "on" : "off");

# Initial image offset (Page 1 of results)
my $start = "0";

# Validate input and display help if needed.
&help if ($opt{'help'} || !$opt{'query'});

#------------------------------------------------------------#
# Create objects we'll need.                                  #
#------------------------------------------------------------#

# LWP for HTTP requests.
my $ua = new LWP::UserAgent;
$ua->agent($opt{'ua'});    # Google doesn't like LWP.

# HTML::Parser for scraping HTML.
my $p = new HTML::Parser (
    api_version => 3,
    start_h     => [\&tag, "tagname, attr"],
);

# Parallel::ForkManager to handle simultaneous downloads.
my $pfm = new Parallel::ForkManager($opt{'procs'});

#------------------------------------------------------------#
# Parse each page of HTML for images. Stored in @images.      #
#------------------------------------------------------------#
# $start will be passed to google to tell it which page of    #
# results to display. 20 images per page.                     #
#------------------------------------------------------------#
# $test is used to see if we need another page.               #
#------------------------------------------------------------#
my @images;
my $done = 0;
my $page = 1;

until ($done) {
    $opt{'verbose'} && print "Fetching page " . $page++ . " of results.\n";
    my $test = $start;
    my $req  = HTTP::Request->new(GET => $url . "\&start=$start");
    $p->parse($ua->request($req)->content);
    $done = 1 if $test == $start;
}

#------------------------------------------------------------#
# Fetch all images stored in @images.                         #
#------------------------------------------------------------#
foreach my $img (@images) {
    # Fork a child to execute code in this loop.
    $pfm->start and next;

    # Get our image URL, referring URL and a unique filename.
    my ($imgurl, $filename, $refurl) = @$img;
    $filename = unique($filename);

    $opt{'verbose'} && print "Fetching $imgurl as $filename\n";

    # Download the image and save it to disk.
    my $req = HTTP::Request->new(GET => "http://$imgurl");
    $req->referer($refurl);
    $ua->request($req, "$opt{'dir'}/$filename");

    # Indicate this child process is finished.
    $pfm->finish;
}

#------------------------------------------------------------#
# Wait for all children to finish and exit cleanly.           #
#------------------------------------------------------------#
$pfm->wait_all_children;
exit 0;

#------------------------------------------------------------#
# tag() is our HTML::Parser callback for handling start tags #
#------------------------------------------------------------#
sub tag {
    my ($tagname, $attr) = (@_);

    #
    # If we see the "nav_next.gif" image, we know we should go
    # to the next page to collect more images. $start is our
    # offset for the next page.
    #
    if ($attr->{'src'} && ($attr->{'src'} eq "/nav_next.gif")) {
        $start += 20;
    }

    #
    # Look for links to "imgres". This will show our image URL
    # and the page it's used on. We'll use the latter to spoof
    # our referring URL in case the host doesn't allow offsite
    # image linking (tripod, etc.).
    #
    return unless ($tagname eq 'a');
    return unless (
        $attr->{'href'} =~ /imgres\?imgurl=(.*\/([^\&]*))\&imgrefurl=([^\&]*)\&/
    );

    #
    # We've got a real image, so we'll remember it for downloading.
    #
    push(@images, [ $1, $2, $3 ]);    # imgurl, filename, refurl
}

#------------------------------------------------------------#
# unique() ensures we're not overwriting existing files by    #
# returning an unused filename based on the one provided.     #
#------------------------------------------------------------#
sub unique {
    my $f = shift;
    return $f unless -e "$opt{'dir'}/$f";

    my $count = 1;
    while (-e "$opt{'dir'}/$count.$f") {
        $count++;
    }

    return "$count.$f";
}

#------------------------------------------------------------#
# help() displays usage information.                          #
#------------------------------------------------------------#
sub help {
    print <<ENDHELP;
$0 scrapes images.google.com for images matching the keyword
specified on the commandline. Images are downloaded and placed in
the current directory by default.

Usage: $0 --query "image keyword(s)" [OPTIONS]

Options:

  --query string  Search string for images. Required. No default.

  --verbose       Show what the script is doing as it goes.
                  Defaults to off.

  --safe          Use google's safesearch to filter naughty pictures.
                  Defaults to off.

  --procs n       Number of simultaneous image downloads to run.
                  Defaults to 20.

  --dir path      Directory to store downloaded images to.
                  Defaults to "." (current directory)

  --ua string     images.google.com doesn't like robots. This is the
                  user-agent string we spoof. Defaults to "Mozilla/1.0"

  --help          You're looking at it, cowboy.

Notes:

  Images are given unique filenames by prepending a number.
  For example, "10.header.jpg"

  Usage may violate Google's TOS. Use at your own risk.

ENDHELP
}
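
The least obvious piece of the script is probably the event-driven HTML::Parser interface: with api_version 3 you register a start-tag handler plus an "argspec" string that tells the parser which values to hand the callback. Below is a minimal, self-contained sketch of just that pattern. The sample HTML and the link_tag() handler are invented for illustration; only the HTML::Parser calls mirror what the script above does.

#!/usr/bin/perl -w
# Minimal sketch of the HTML::Parser (api_version 3) callback pattern.
# The HTML snippet and link_tag() are hypothetical examples.
use strict;
use HTML::Parser;

my @hrefs;

# Collect the href of every <a> tag we see.
sub link_tag {
    my ($tagname, $attr) = @_;
    return unless $tagname eq 'a' && defined $attr->{'href'};
    push @hrefs, $attr->{'href'};
}

# "tagname, attr" is the argspec: it asks HTML::Parser to call the
# handler with the lowercased tag name and a hashref of attributes.
my $p = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&link_tag, "tagname, attr" ],
);

$p->parse('<p>thumb: <a href="/imgres?imgurl=example.com/pic.jpg">pic</a></p>');
$p->eof;

print "$_\n" for @hrefs;   # prints /imgres?imgurl=example.com/pic.jpg

The tag() callback in the script works the same way; it just also watches for the nav_next.gif image to decide whether another page of results should be requested. The downloads themselves are wrapped in Parallel::ForkManager calls, so everything between $pfm->start and $pfm->finish runs in a forked child, with at most --procs children at a time.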

Scrape Google's Image Search by Hutta
