or similar items; anyway, I rolled my own downloader.
As the name says, it was born to download just the images linked in web pages (i.e. the big images behind thumbnails). It then evolved into something useful for many tasks, as it accepts chunks of code on the command line that modify which files to download, among other things (refer to the man page for details).
#!/usr/bin/perl
# getimages - download files (typically images) linked from HTML pages.
use strict;
use warnings;

# Globals shared between the main loop and the SIGINT/state-saving handler.
# (replaces the obsolete `use vars`)
our ($ref_image, @err_images, $CURRENT_FILE);

use LWP::UserAgent;
use HTTP::Request::Common qw(GET);
use HTML::LinkExtor;
use Storable;
use Term::ProgressBar;
use Getopt::Long qw(:config bundling_override);
use Pod::Usage;

use constant CHUNK => 4096;    # download chunk size in bytes

$| = 1;                           # unbuffer STDOUT (progress output)
$SIG{INT} = \&store_remaining;    # CTRL+C saves state for a later -resume

# Defaults, all overridable from the command line (see POD below).
our $user_agent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98)';
our $file_type  = '/\.jpe?g$/i';    # code block eval'd with $_ = link URL
our $filter     = '';               # code block that may rewrite $_ (the URL)
our $prepend    = 0;                # number of URL path components kept in filename
my ($dump, $resume);
our $STORE_FILE = '.getimages.dat';

GetOptions(
    "u|ua|user_agent=s"    => \$user_agent,
    "t|type|file_type=s"   => \$file_type,
    "f|filter=s"           => \$filter,
    "d|dump"               => \$dump,
    "r|resume"             => \$resume,
    "p|prepend=i"          => \$prepend,
    "h|help|?"             => sub { pod2usage(-verbose => 1) },
    "m|man"                => sub { pod2usage(-verbose => 2) },
    "s|store|store_file=s" => \$STORE_FILE,
) or pod2usage(-verbose => 0);
# No URLs on the command line and STDIN is not a terminal:
# read the URL list from STDIN (one per line).
unless (@ARGV or -t) {
    chomp(@ARGV = <>);
}
our @rem_files  = @ARGV;    # HTML pages still to be fetched
our @rem_images = ();       # image URLs extracted but not yet downloaded
my $use_store;

# -resume: restore the state saved by store_remaining() on a previous run.
# (The wrapped posting broke this statement; reconstructed here.)
if ($resume) {
    my $ref = retrieve($STORE_FILE)
        or die "Cannot retrieve ", $STORE_FILE, $/;
    unshift @rem_files, @{ $ref->{files} };
    @rem_images = @{ $ref->{images} };
    @err_images = @{ $ref->{err_images} };
    $file_type  = $ref->{file_type};
    $filter     = $ref->{filter};
    $user_agent = $ref->{user_agent};
    $prepend    = $ref->{prepend};
    $use_store  = 1;    # remember to unlink the store file on clean exit
}
# Predicate built from the user-supplied -type code: evaluated with $_ set
# to the link URL; its truth value decides whether the link is downloaded.
# NOTE(review): this is a deliberate string eval of command-line-supplied
# code -- a documented feature of this tool, not an accident (see POD).
my $sfile_type = sub {
    local $_ = shift;
    eval $file_type;
};

# Rewriter built from the user-supplied -filter code: may modify $_ (the
# URL); the possibly-modified URL is returned.
my $sfilter = sub {
    local $_ = shift;
    eval $filter;
    return $_;
};

# Direct method call instead of indirect object syntax (`new LWP::UserAgent`).
my $ua = LWP::UserAgent->new;
$ua->agent($user_agent);
# Finish any images restored from a -resume state file first.
getimages( \@rem_images ) if @rem_images;

# Fetch each HTML page, extract matching <a href> links, and download them.
while (@rem_files) {
    $CURRENT_FILE = shift @rem_files;
    my $response = $ua->get($CURRENT_FILE);
    unless ($response->is_success) {
        print STDERR "Error retrieving html file: ",
            $response->status_line, "\n";
        next;
    }
    # Base URL = everything up to the last '/' (used to absolutize links).
    my ($baseurl) = $CURRENT_FILE =~ /^(.+\/).*?$/;
    my $parser = HTML::LinkExtor->new(undef, $baseurl);
    $parser->parse( $response->content );
    # links() yields [tag, attr => url, ...]; keep only <a> links whose URL
    # passes the -type predicate, then run each through the -filter rewriter.
    @rem_images = map  { $sfilter->( $_->[2] ) }
                  grep { $_->[0] eq 'a' and $sfile_type->( $_->[2] ) }
                  $parser->links;
    if ($dump) {    # -dump: list the URLs instead of downloading them
        print join("\n", @rem_images), "\n";
        next;
    }
    getimages( \@rem_images );
}
exit if $dump;

# One automatic retry pass over everything that failed the first time.
if (@err_images) {
    print STDERR "There were errors retrieving some images. Gonna retry now.\n";
    @rem_images = @err_images;
    @err_images = ();
    getimages( \@rem_images );
    if (@err_images) {
        print STDERR "Still some errors...\n";
        store_remaining();    # save state so -resume can retry later; exits
    }
}
unlink $STORE_FILE if $use_store;    # clean finish: drop the resume file
print STDERR "Done.\n";
# SIGINT handler / give-up path: persist the whole download state with
# Storable so a later run with -resume can pick up where we left off,
# then exit. Uses the package globals set up by the main loop.
sub store_remaining {
    # Interrupted while fetching/parsing a page (no images queued yet):
    # requeue the page itself.
    unshift @rem_files, $CURRENT_FILE unless @rem_images;
    store {
        files      => [@rem_files],
        # $ref_image is the image being downloaded right now; stringify
        # everything, and drop undef/empty entries -- a "" in this list
        # would falsely terminate the download loop on resume.
        images     => [grep { defined and length }
                       map { "$_" } ($ref_image, @rem_images)],
        err_images => [map { "$_" } @err_images],
        file_type  => $file_type,
        filter     => $filter,
        user_agent => $user_agent,
        prepend    => $prepend,
    }, $STORE_FILE;
    print <<EOF;
Status saved in "$STORE_FILE"
Launch again with "-resume" option to continue downloading from where you left.
EOF
    exit;
}
# Download one URL into the current directory with a progress bar.
# The local filename is built from the last abs($prepend)+1 path components
# of the URL joined with '_' (see -prepend in the POD); name clashes are
# resolved by uniname(). Data goes to a "__INCOMPLETE__"-prefixed temp file
# that is renamed into place only on success; failed URLs are pushed onto
# @err_images for the retry pass.
sub getimage {
    my $url = shift;

    # Split the URL (scheme stripped) into host + path components.
    my @path = $url =~ m!(?:http://)?([^/]+)/?!g;
    my $image = join "_", splice( @path, -(
            abs($prepend) > $#path ? $#path : abs($prepend)
        ) - 1);
    $image = uniname( $image );
    my $inc_image = "__INCOMPLETE__$image";

    my ($i, $bytes, $size, $code, $meter) = (0, 0, 0, undef, undef);
    my $fh;    # lexical filehandle (was a global bareword FH)

    # Per-chunk callback for LWP: on the first chunk, open the output file
    # and set up the progress meter; then append every chunk.
    my $saveimage = sub {
        my ($data, $response, $protocol) = @_;
        unless ($i++) {
            # $! (not $@) carries open()'s error -- the original die used $@.
            open($fh, ">", $inc_image)
                or die "Cannot write $image: $!\n";
            binmode $fh;    # image data is binary
            # "|| 0" keeps $size a number even when the server sends no
            # Content-Length (undef); 0 and undef are treated alike below.
            $size = $response->headers->content_length || 0;
            $code = $response->code;
            my $name = sprintf "$image (%s)",
                $size ? sprintf("%.1fkb", $size / 1024) : 'unknown size';
            if ($size) {
                $meter = Term::ProgressBar->new({
                    name  => $name,
                    count => $size,
                    ETA   => 'linear',
                });
                $meter->minor(0);
            }
        }
        print {$fh} $data;
        if ($size) {
            $bytes += length $data;
            $meter->update($bytes);
        }
    };

    my $request = GET($url);
    $request->referer($CURRENT_FILE);    # some servers require a referer
    my $response = $ua->request($request, $saveimage, CHUNK);
    close $fh if $fh;    # $fh stays undef when the callback never ran

    # Success: HTTP 200 and either the on-disk size matches Content-Length,
    # or the server sent none at all ("A xor !B": exactly one must hold).
    if (defined $code and $code == 200
            and ($size == -s $inc_image xor !$size)) {
        rename $inc_image, $image;
    } else {
        printf "\nERROR: %d %s", $code || $response->code, $image;
        push @err_images, $url;
    }
    print "\n";
}
# Drain the queued image list, downloading each URL in turn. URLs are
# shift()ed off the caller's array (so interrupted work is not re-queued)
# and the current one is kept in the global $ref_image so that
# store_remaining() can save it on interrupt.
sub getimages {
    my $ref = shift;
    # Equivalent to the original C-style for loop: stops on the first
    # false value (empty list yields undef).
    while ($ref_image = shift @$ref) {
        getimage($ref_image);
    }
}
# Return a filename that does not collide with an existing file in the
# current directory: $fn itself if free, otherwise the first of
# "000_$fn", "001_$fn", ... that is not yet taken.
sub uniname {
    my $fn = shift;
    return $fn unless -e $fn;
    my $ufn;
    # $i initialized explicitly (the original relied on undef-as-zero,
    # which warns under `use warnings`).
    for (my $i = 0; -e ($ufn = sprintf "%03d_$fn", $i); $i++) {}
    return $ufn;
}
__END__
=head1 NAME
getimages - gets images (and more) whose links to are in a HTML page
=head1 SYNOPSIS
getimages [OPTIONS] [URLs...]
If no URL is specified getimages will try to read URLs from STDIN
Options:
    (-t|-type) REGEXP    Download just the files whose names match REGEXP
                         (defaults to /\.jpe?g$/i)
    (-f|-filter) CODE    Modify URLs to download (in $_) according to CODE
    (-d|-dump)           Prints the list of extracted URLs without
                         downloading them
    (-r|-resume)         Continue downloading from where you left last time
    (-ua|-user_agent)    Changes user-agent
    (-p|-prepend)        Prepend a path to the filename downloaded
    (-s|-store)          Changes the status save file
                         (defaults to ".getimages.dat")
    (-h|-help|-?)        Display a help message to STDOUT
    (-m|-man)            Display the full man page
    CTRL+C               Stops the program (saves state in ".getimages.dat",
                         unless otherwise specified with the "-s" option)
=head1 OPTIONS
=over 8
=item B<-t|-type> REGEXP
This option lets you specify a REGEXP (though it is really a code block) that
will be evaluated in a context where $_ is the filename.
The program will download just the filenames for which the code block
returns a true value.
=item B<-f|-filter> CODE
This option will apply CODE in a context where $_ is the filename.
You can modify it as you like.
=item B<-p|-prepend> INTEGER
This is the number of directories in the URL that will be prepended to the
filename. It defaults to 0. A translation between '/' and '_' will
also take place. For example, if the file you are trying to download is
"http://www.domain.com/foo/bar/00.jpg" and you set the prepend parameter
to 1, the file will be saved as "bar_00.jpg".
=back
=head1 DESCRIPTION
getimages was born to let you easily download the big images that stand behind
thumbnails in most web pages. It then evolved to let you include chunks of code
from the command line and to download whichever kind of file you like.
=head1 EXAMPLES
Download images linked at http://foo.org
getimages http://foo.org/
Download MPEG audio files from a list of URLs you got in a file
getimages -t '/\.mp\d$/i' < mympegURLlist.txt
Go deeper one level and we want every pic
getimages -dump -t '/pics.+?\.html?$/' http://foo.org/ | \
getimages -t '/\.(gif|png|jpe?g)$/'
Every image is displayed through a CGI page
but we don't want to download every page
getimages -dump -t 'm!/pictures/.+\.shtml?$!' http://foo.org/ | \
getimages -f 's!cgi-bin/view(\w+)?\.cgi\?!pictures/!'
=head1 COPYRIGHT
Copyright 2002 Giulio Motta L<giulienk@cpan.org>.
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
$|=$_="1g2i1u1l2i4e2n0k",map{print"\7",chop;select$,,$,,$,,$_/7}m{..}g