or similar items; anyway, I rolled my own downloader.
As the name says, it was born to download just the images linked in web pages (i.e. the big images behind thumbnails). It then evolved into something useful for many tasks, as it accepts chunks of code on the command line that modify which files to download, among other things (refer to the man page for details).
#!/usr/bin/perl
# getimages - download files (typically images) linked from HTML pages.
use strict;
use warnings;

# Globals shared between the main loop and the SIGINT/state-saving handler.
# (replaces the obsolete `use vars`)
our ($ref_image, @err_images, $CURRENT_FILE);

use LWP::UserAgent;
use HTTP::Request::Common qw(GET);
use HTML::LinkExtor;
use Storable;
use Term::ProgressBar;
use Getopt::Long qw(:config bundling_override);
use Pod::Usage;

use constant CHUNK => 4096;    # download chunk size in bytes

$| = 1;                           # unbuffer STDOUT (progress output)
$SIG{INT} = \&store_remaining;    # CTRL+C saves state for a later -resume

# Defaults, all overridable from the command line (see POD below).
our $user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98)';
our $file_type  = '/\.jpe?g$/i';    # code block eval'd with $_ = link URL
our $filter     = '';               # code block that may rewrite $_ (the URL)
our $prepend    = 0;                # number of URL path components kept in filename
my ($dump, $resume);
our $STORE_FILE = '.getimages.dat';

GetOptions(
    "u|ua|user_agent=s"    => \$user_agent,
    "t|type|file_type=s"   => \$file_type,
    "f|filter=s"           => \$filter,
    "d|dump"               => \$dump,
    "r|resume"             => \$resume,
    "p|prepend=i"          => \$prepend,
    "h|help|?"             => sub { pod2usage(-verbose => 1) },
    "m|man"                => sub { pod2usage(-verbose => 2) },
    "s|store|store_file=s" => \$STORE_FILE,
) or pod2usage(-verbose => 0);
# No URLs on the command line and STDIN is not a terminal:
# read the URL list from STDIN (one per line).
unless (@ARGV or -t) {
    chomp(@ARGV = <>);
}
our @rem_files  = @ARGV;    # HTML pages still to be fetched
our @rem_images = ();       # image URLs extracted but not yet downloaded
my $use_store;

# -resume: restore the state saved by store_remaining() on a previous run.
# (The wrapped posting broke this statement; reconstructed here.)
if ($resume) {
    my $ref = retrieve($STORE_FILE)
        or die "Cannot retrieve ", $STORE_FILE, $/;
    unshift @rem_files, @{ $ref->{files} };
    @rem_images = @{ $ref->{images} };
    @err_images = @{ $ref->{err_images} };
    $file_type  = $ref->{file_type};
    $filter     = $ref->{filter};
    $user_agent = $ref->{user_agent};
    $prepend    = $ref->{prepend};
    $use_store  = 1;    # remember to unlink the store file on clean exit
}
# Predicate built from the user-supplied -type code: evaluated with $_ set
# to the link URL; its truth value decides whether the link is downloaded.
# NOTE(review): this is a deliberate string eval of command-line-supplied
# code -- a documented feature of this tool, not an accident (see POD).
my $sfile_type = sub {
    local $_ = shift;
    eval $file_type;
};

# Rewriter built from the user-supplied -filter code: may modify $_ (the
# URL); the possibly-modified URL is returned.
my $sfilter = sub {
    local $_ = shift;
    eval $filter;
    return $_;
};

# Direct method call instead of indirect object syntax (`new LWP::UserAgent`).
my $ua = LWP::UserAgent->new;
$ua->agent($user_agent);
# Finish any images restored from a -resume state file first.
getimages( \@rem_images ) if @rem_images;

# Fetch each HTML page, extract matching <a href> links, and download them.
while (@rem_files) {
    $CURRENT_FILE = shift @rem_files;
    my $response = $ua->get($CURRENT_FILE);
    unless ($response->is_success) {
        print STDERR "Error retrieving html file: ",
            $response->status_line, "\n";
        next;
    }
    # Base URL = everything up to the last '/' (used to absolutize links).
    my ($baseurl) = $CURRENT_FILE =~ /^(.+\/).*?$/;
    my $parser = HTML::LinkExtor->new(undef, $baseurl);
    $parser->parse( $response->content );
    # links() yields [tag, attr => url, ...]; keep only <a> links whose URL
    # passes the -type predicate, then run each through the -filter rewriter.
    @rem_images = map  { $sfilter->( $_->[2] ) }
                  grep { $_->[0] eq 'a' and $sfile_type->( $_->[2] ) }
                  $parser->links;
    if ($dump) {    # -dump: list the URLs instead of downloading them
        print join("\n", @rem_images), "\n";
        next;
    }
    getimages( \@rem_images );
}
exit if $dump;

# One automatic retry pass over everything that failed the first time.
if (@err_images) {
    print STDERR "There were errors retrieving some images. Gonna retry now.\n";
    @rem_images = @err_images;
    @err_images = ();
    getimages( \@rem_images );
    if (@err_images) {
        print STDERR "Still some errors...\n";
        store_remaining();    # save state so -resume can retry later; exits
    }
}
unlink $STORE_FILE if $use_store;    # clean finish: drop the resume file
print STDERR "Done.\n";
# SIGINT handler / give-up path: persist the whole download state with
# Storable so a later run with -resume can pick up where we left off,
# then exit. Uses the package globals set up by the main loop.
sub store_remaining {
    # Interrupted while fetching/parsing a page (no images queued yet):
    # requeue the page itself.
    unshift @rem_files, $CURRENT_FILE unless @rem_images;
    store {
        files      => [@rem_files],
        # $ref_image is the image being downloaded right now; stringify
        # everything, and drop undef/empty entries -- a "" in this list
        # would falsely terminate the download loop on resume.
        images     => [grep { defined and length }
                       map { "$_" } ($ref_image, @rem_images)],
        err_images => [map { "$_" } @err_images],
        file_type  => $file_type,
        filter     => $filter,
        user_agent => $user_agent,
        prepend    => $prepend,
    }, $STORE_FILE;
    print <<EOF;
Status saved in "$STORE_FILE"
Launch again with "-resume" option to continue downloading from where you left.
EOF
    exit;
}
# Download one URL into the current directory with a progress bar.
# The local filename is built from the last abs($prepend)+1 path components
# of the URL joined with '_' (see -prepend in the POD); name clashes are
# resolved by uniname(). Data goes to a "__INCOMPLETE__"-prefixed temp file
# that is renamed into place only on success; failed URLs are pushed onto
# @err_images for the retry pass.
sub getimage {
    my $url = shift;

    # Split the URL (scheme stripped) into host + path components.
    my @path = $url =~ m!(?:http://)?([^/]+)/?!g;
    my $image = join "_", splice( @path, -(
            abs($prepend) > $#path ? $#path : abs($prepend)
        ) - 1);
    $image = uniname( $image );
    my $inc_image = "__INCOMPLETE__$image";

    my ($i, $bytes, $size, $code, $meter) = (0, 0, 0, undef, undef);
    my $fh;    # lexical filehandle (was a global bareword FH)

    # Per-chunk callback for LWP: on the first chunk, open the output file
    # and set up the progress meter; then append every chunk.
    my $saveimage = sub {
        my ($data, $response, $protocol) = @_;
        unless ($i++) {
            # $! (not $@) carries open()'s error -- the original die used $@.
            open($fh, ">", $inc_image)
                or die "Cannot write $image: $!\n";
            binmode $fh;    # image data is binary
            # "|| 0" keeps $size a number even when the server sends no
            # Content-Length (undef); 0 and undef are treated alike below.
            $size = $response->headers->content_length || 0;
            $code = $response->code;
            my $name = sprintf "$image (%s)",
                $size ? sprintf("%.1fkb", $size / 1024) : 'unknown size';
            if ($size) {
                $meter = Term::ProgressBar->new({
                    name  => $name,
                    count => $size,
                    ETA   => 'linear',
                });
                $meter->minor(0);
            }
        }
        print {$fh} $data;
        if ($size) {
            $bytes += length $data;
            $meter->update($bytes);
        }
    };

    my $request = GET($url);
    $request->referer($CURRENT_FILE);    # some servers require a referer
    my $response = $ua->request($request, $saveimage, CHUNK);
    close $fh if $fh;    # $fh stays undef when the callback never ran

    # Success: HTTP 200 and either the on-disk size matches Content-Length,
    # or the server sent none at all ("A xor !B": exactly one must hold).
    if (defined $code and $code == 200
            and ($size == -s $inc_image xor !$size)) {
        rename $inc_image, $image;
    } else {
        printf "\nERROR: %d %s", $code || $response->code, $image;
        push @err_images, $url;
    }
    print "\n";
}
# Drain the queued image list, downloading each URL in turn. URLs are
# shift()ed off the caller's array (so interrupted work is not re-queued)
# and the current one is kept in the global $ref_image so that
# store_remaining() can save it on interrupt.
sub getimages {
    my $ref = shift;
    # Equivalent to the original C-style for loop: stops on the first
    # false value (empty list yields undef).
    while ($ref_image = shift @$ref) {
        getimage($ref_image);
    }
}
# Return a filename that does not collide with an existing file in the
# current directory: $fn itself if free, otherwise the first of
# "000_$fn", "001_$fn", ... that is not yet taken.
sub uniname {
    my $fn = shift;
    return $fn unless -e $fn;
    my $ufn;
    # $i initialized explicitly (the original relied on undef-as-zero,
    # which warns under `use warnings`).
    for (my $i = 0; -e ($ufn = sprintf "%03d_$fn", $i); $i++) {}
    return $ufn;
}
__END__
=head1 NAME
getimages - gets images (and more) whose links to are in a HTML page
=head1 SYNOPSIS
getimages [OPTIONS] [URLs...]
If no URL is specified getimages will try to read URLs from STDIN
Options:
    (-t|-type) REGEXP    Download just the files whose names match REGEXP
                         (defaults to /\.jpe?g$/i)
    (-f|-filter) CODE    Modify URLs to download (in $_) according to CODE
    (-d|-dump)           Prints the list of extracted URLs without
                         downloading them
    (-r|-resume)         Continue downloading from where you left last time
    (-ua|-user_agent)    Changes user-agent
    (-p|-prepend)        Prepend a path to the filename downloaded
    (-s|-store)          Changes the status save file
                         (defaults to ".getimages.dat")
    (-h|-help|-?)        Display a help message to STDOUT
    (-m|-man)            Display the full man page
    CTRL+C               Stops the program (saves state in ".getimages.dat",
                         unless otherwise specified with the "-s" option)
=head1 OPTIONS
=over 8
=item B<-t|-type> REGEXP
This option lets you specify a REGEXP (though it is really a code block) that
will be evaluated in a context where $_ is the filename.
The program will download just the filenames for which the code block
returns a true value.
=item B<-f|-filter> CODE
This option will apply CODE in a context where $_ is the filename.
You can modify it as you like.
=item B<-p|-prepend> INTEGER
This is the number of directories in the URL that will be prepended to the
filename. It defaults to 0. A translation between '/' and '_' will
also take place. For example, if the file you are trying to download is
"http://www.domain.com/foo/bar/00.jpg" and you set the prepend parameter
to 1, the file will be saved as "bar_00.jpg".
=back
=head1 DESCRIPTION
getimages was born to let you easily download the big images that stand behind
thumbnails in most web pages. It then evolved to let you include chunks of code
from the command line and to download whichever kind of file you like.
=head1 EXAMPLES
Download images linked at http://foo.org
getimages http://foo.org/
Download MPEG audio files from a list of URLs you got in a file
getimages -t '/\.mp\d$/i' < mympegURLlist.txt
Go deeper one level and we want every pic
getimages -dump -t '/pics.+?\.html?$/' http://foo.org/ | \
getimages -t '/\.(gif|png|jpe?g)$/'
Every image is displayed through a CGI page
but we don't want to download every page
getimages -dump -t 'm!/pictures/.+\.shtml?$!' http://foo.org/ | \
getimages -f 's!cgi-bin/view(\w+)?\.cgi\?!pictures/!'
=head1 COPYRIGHT
Copyright 2002 Giulio Motta L<giulienk@cpan.org>.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
$|=$_="1g2i1u1l2i4e2n0k",map{print"\7",chop;select$,,$,,$,,$_/7}m{..}g