in reply to Swimsuits2004
(Cue "spiderman" theme song...)
Here's my spider version for the current website, fetching everything from 1996 to 2004 (for now) and the ultimate and 50th edition images:
#!/usr/bin/perl
# Spider for the 1996-2004 SI swimsuit collection: walk the collection
# index, follow each per-model page, and mirror every full-size image
# whose name is revealed by its thumbnail ("...t.jpg" -> "....jpg").
use strict;
use warnings;
$|++;                           # unbuffer STDOUT so progress shows immediately
use LWP::Simple;

my $BASE = "http://sportsillustrated.cnn.com";

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "$BASE/swimsuit/collection/";
defined $all_model_index
    or die "cannot fetch the collection index page\n";

# Each match is the site-relative path of one model's page.
while ($all_model_index =~ m{(/swimsuit/collection/models/[-\w]+\.html)}g) {
    my $model_index = get "$BASE/$1";
    next unless defined $model_index;   # skip model pages that fail to load
    # Thumbnail URLs end in "t.jpg"; dropping the "t" yields the full image.
    while ($model_index =~ m{"(http://i\.cnn\.net/si/pr/subs/swimsuit/images/)([-\w]+)t\.jpg"}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        print "$url => $file: ";
        if (-e $file) {
            print "skip\n";
        } else {
            # mirror() writes the file and returns the HTTP status code.
            print mirror($url, $file), "\n";
        }
    }
}
Amazingly enough, this also fetches the "private" images that you should only be able to get if you're registered. Apparently, although the HTML pages are protected with their login, the images themselves are not, and the image thumbnails give away the full image names. Cool.
And once you get the results, you can symlink them by person with this:
#!/usr/bin/perl
# Build a per-model view of the downloaded images: for every file in
# RESULTS/, create SORTED/<model>/<year-patched-name> as a symlink back
# to the original file.
use strict;
use warnings;
$|++;

-d "SORTED" or mkdir "SORTED" or die "mkdir SORTED: $!";

for my $path (glob "RESULTS/*") {
    # File names look like "<yy>_<model>_<n>.jpg" (n may contain 'b').
    my ($basename, $person) = $path =~ m{RESULTS/(.*?_(.*?)_[\db]+\.jpg)$}
        or die "unrecognized file name: $path";

    my $dir = "SORTED/$person";
    -d $dir or mkdir $dir or die "mkdir $dir: $!";

    # Patch up two-digit year prefixes: 9x -> 199x, 0x -> 200x.
    (my $target = $basename) =~ s/^9/199/ or $target =~ s/^0/200/;
    $target = "$dir/$target";

    # The link lives in SORTED/<person>/, hence the ../../ back to cwd.
    -e $target
        or symlink "../../$path", $target
        or die "ln -s ../../$path $target: $!";
}
Re^2: Swimsuits2004
by demerphq (Chancellor) on Jan 20, 2005 at 19:24 UTC
|
I modified it to put a sleep(1+rand(5)) as the last statement of each loop. Given that this is downloading something like a thousand images, I reckon maintaining a low profile (i.e. not hammering their servers) when you do it is probably a good thing.
| [reply] [d/l] |
Swimsuits2005
by merlyn (Sage) on Feb 15, 2005 at 18:38 UTC
|
The 2005 results just went online... I'm fetching them with this:
#!/usr/bin/perl
# Spider for the 2005 SI swimsuit site: walk the model index, follow
# each model page, and mirror the full-size version of every thumbnail
# ("...t.jpg" -> "....jpg") into RESULTS/.
use strict;
use warnings;
$|++;                           # unbuffer STDOUT so progress shows immediately
use LWP::Simple;

my $BASE = "http://sportsillustrated.cnn.com";

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "$BASE/features/2005_swimsuit/";
defined $all_model_index
    or die "cannot fetch the 2005 index page\n";

while ($all_model_index =~ m{(/features/2005_swimsuit/models/[-\w]+\.html)}g) {
    my $model_index = get "$BASE/$1";
    next unless defined $model_index;   # skip model pages that fail to load
    while ($model_index =~ m{"(http://i\.a\.cnn\.net/si/features/2005_swimsuit/models/images/)([-\w]+)t\.jpg"}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        print "$url => $file: ";
        if (-e $file) {
            print "skip\n";
        } else {
            # mirror() writes the file and returns the HTTP status code.
            print mirror($url, $file), "\n";
        }
    }
}
| [reply] [d/l] |
|
Mere minutes after the 2006 site went live, I reverse engineered it for this. This doesn't get the "subscription exclusive" shots as the prior versions did... they finally got smart and put it in a separate index. Nor does it grab the videos and a few of the other odd extra things. "Until next time, enjoy!"
#!/usr/bin/perl
# Spider for the 2006 SI swimsuit site.  Walks the front page for the
# allstar/models galleries, plus a few pages that are not linked from
# the index, mirroring every full-size image referenced by a thumbnail.
use strict;
use warnings;
$|++;
use LWP::Simple;

my $BASE = "http://sportsillustrated.cnn.com";

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "$BASE/features/2006_swimsuit/";
defined $all_model_index
    or die "cannot fetch the 2006 index page\n";

while ($all_model_index =~ m{(/features/2006_swimsuit/(?:allstar|models)/[-\w]+\.html)}g) {
    doit($1);
}
# Pages not reachable from the main index.
doit("/features/2006_swimsuit/allstar/allstar_reunion.html");
doit("/features/2006_swimsuit/athletes/");
doit("/features/2006_swimsuit/painting/");

# Fetch one gallery page and mirror every full-size image it references
# (thumbnail names end in "t.jpg"; the full image drops the "t").
sub doit {
    my $path        = shift;
    my $model_index = get "$BASE/" . $path;
    return unless defined $model_index;     # skip pages that fail to load
    while ($model_index =~ m{'(http://i\.a\.cnn\.net/si/features/2006_swimsuit/images/gallery/photos/)([\w.\-]+)t\.jpg'}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        print "$url => $file: ";            # shared by both branches below
        if (-e $file) {
            print "skip\n";
        } else {
            print mirror($url, $file), "\n";
        }
    }
}
| [reply] [d/l] |
|
#!/usr/bin/env perl
# Spider for the 2007 SI swimsuit site.  Galleries live one directory
# deep; each is fetched via its "index2.html" page (falling back to the
# directory index), and every thumbnail's full-size image is mirrored.
use strict;
use warnings;
$|++;
use LWP::Simple;

my $SITE = "http://sportsillustrated.cnn.com";

-d "RESULTS" or mkdir "RESULTS", 0755 or die "cannot mkdir RESULTS: $!";

my $all_model_index = get "$SITE/features/2007_swimsuit/models/";
defined $all_model_index
    or die "cannot fetch the 2007 model index\n";

while ($all_model_index =~ m{(/features/2007_swimsuit/(?:models|painting|onlocation)/[-\w]+/)}g) {
    doit($1);
}
# Galleries not linked from the model index.
doit("/features/2007_swimsuit/beyonce/");
doit("/features/2007_swimsuit/3d/");

# Fetch one gallery (preferring its index2.html page) and mirror every
# full-size image it references.
sub doit {
    my $base = shift;
    print "$base =>\n";
    my $model_index = get "$SITE/$base/index2.html";
    unless (defined $model_index) {
        $model_index = get "$SITE/$base/";
    }
    return unless defined $model_index;     # both fetches failed: skip
    while ($model_index =~ m{"(http://i\.a\.cnn\.net/si/features/2007_swimsuit/images/photos/)([\w.\-]+)t\.jpg"}g) {
        my $url  = "$1$2.jpg";
        my $file = "RESULTS/$2.jpg";
        print "$url => $file: ";            # shared by both branches below
        if (-e $file) {
            print "skip\n";
        } else {
            print mirror($url, $file), "\n";
        }
    }
}
| [reply] [d/l] |
|
|
|
|
#!perl
# getimagesx.pl -- download a numbered sequence of files.
#
# Finds the last run of digits in each URL and keeps incrementing it,
# downloading until ten requests in a row fail.  An optional prefix (or
# "prefix/suffix") decorates the saved file names; -r fetches a referer
# page first, keeps its cookies, and sends that URL as the Referer
# header with each image request.
use strict;
use warnings;
no warnings 'uninitialized';    # $ARGV[0]/$prefix are legitimately undef below
use LWP::UserAgent;
use HTTP::Cookies;

if (!@ARGV or $ARGV[0] =~ m{^[-/]h(elp)?$} or $ARGV[0] =~ m{^[-/]\?$}) {
    # NOTE: the loop below allows ten consecutive failures, so the help
    # text says "ten" (the original text said "three", contradicting it).
    print <<'*END*';
getimagesx.pl [-r referer] url [prefix [-r referer] url prefix] ...
    downloads all files in the sequence = finds the last number in the URL
    and keeps incrementing the number and downloading until it fails
    ten times in a row. The file will be named according to the part of
    the URL following the last slash.
    If you specify just the prefix then the prefix will be prepended to
    the file names.
    If you specify the -r followed by a URL then that URL will be
    downloaded, cookies remembered and the URL will be sent to
    the server as the HTTP_REFERER with the image requests.
*END*
    exit;
}

# Shared browser object: proxy-aware, keep-alive, IE6 agent string, and
# an in-memory cookie jar so the referer page's cookies accompany the
# image requests.
my $ua = LWP::UserAgent->new(
    env_proxy  => 1,
    keep_alive => 1,
    timeout    => 60,
    agent      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; (R1 1.3); .NET CLR 1.0.3705)',
    cookie_jar => HTTP::Cookies->new(),
);

our $referer = '';    # package global: getstore() sends it with every request
while ($ARGV[0] =~ m{^[-/]r}) {
    shift(@ARGV);
    $referer = shift(@ARGV);
    # A referer containing ## is a template (## is replaced by the
    # current number later); a literal referer is fetched once up front.
    if ($referer !~ /##/) {
        my $result = $ua->get( $referer, ':content_file' => 'referer.html');
        if ($result->code() != 200) {
            die "Failed to load the referer $referer\n";
        }
    }
}

while (@ARGV) {
    my $url = shift(@ARGV);
    if ($ARGV[0] =~ m{^[-/]r}) {    # -r may appear between url and prefix...
        shift(@ARGV);
        $referer = shift(@ARGV);
    }
    my $prefix = shift(@ARGV);
    if ($ARGV[0] =~ m{^[-/]r}) {    # ...or after the prefix
        shift(@ARGV);
        $referer = shift(@ARGV);
    }
    # No explicit prefix: derive one from the directory holding the images.
    if (! defined $prefix and $url =~ m{([^/]+)/\d+\.\w+$}) {
        $prefix = $1;
        print "Automatic prefix '$prefix'\n";
    }
    # "prefix/suffix" splits into a leading and a trailing decoration.
    my $suffix = '';
    if ($prefix =~ m{^(.*)/(.*)$}) {
        ($prefix, $suffix) = ($1, $2);
    }

    if ($url =~ m{^(.*/)(.*?)(\d+)([^0-9]*)$}) {
        my ($url_beg, $file, $num, $file_end) = ($1, $2, $3, $4);
        my $result;
        my $errors = 0;    # consecutive failures (was used uninitialized)
        while ($errors < 10) {
            # Per-iteration referer: expand a ## template with the current
            # number and fetch it so the server sees the expected page view.
            local $referer = $referer;
            if ($referer =~ s/##/$num/g) {
                $result = $ua->get( $referer, ':content_file' => 'referer.html');
                if ($result->code() != 200) {
                    die "Failed to load the referer $referer\n";
                }
            }
            # Query-style URLs would repeat the dynamic part in the saved
            # name, so drop $file from the local name in that case.
            my $remote = $url_beg . $file . $num . $file_end;
            my $local  = ($file =~ /[\&\?]/)
                       ? $prefix . $num . $file_end . $suffix
                       : $prefix . $file . $num . $file_end . $suffix;
            print $remote, " => ", $local, "\n";
            $result = getstore($remote, $local);
            # getstore() returns 200/404 or a status line such as
            # "500 Internal Server Error"; string compare avoids a
            # non-numeric warning that the original "== 200" produced.
            if ($result eq '200') {
                $errors = 0;
            } else {
                $errors++;
                print "\t$result\n";
            }
            # String increment: "007"++ stays zero-padded ("008"), so do
            # not rewrite this as $num += 1.
            $num++;
        }
        print "LAST RESULT: $result\n";
    } else {
        # No number in the URL: download the single file as-is.
        my $file = $url;
        $file =~ s{^.*/}{};
        $file = $prefix . $file;
        if ($file) {
            getstore( $url, $file);
        } else {
            print STDERR "Can't download directories! ($url)\n";
        }
    }
}
use Data::Dumper;    # NOTE(review): unused here; presumably a debugging leftover

# Download $url into $file via the shared $ua, sending the current
# $referer.  Never overwrites: an existing file gets a "~N" counter
# inserted before its extension.  Scalar context: returns 200 on
# success, the status line on HTTP failure, or 404 when the server sent
# a text body with a bogus 200 status.  List context: (200, content
# type) -- note the text-body check is skipped there (original quirk).
sub getstore {
    my ($url, $file) = @_;
    # Bump "name.jpg" -> "name~1.jpg" -> "name~2.jpg" ... until unused.
    $file =~ s{(?:~(\d+))?(\.\w+)$}{'~' . ($1+1) . $2}e while (-e $file);

    my $result = $ua->get($url, ':content_file' => $file, referer => $referer);
    return $result->status_line() if $result->code() != 200;

    return (200, $result->content_type) if wantarray();

    # Some servers answer 200 with an HTML error page; treat text as failure.
    if ($result->content_type =~ /^text/i) {
        print "Bloody bastards. They do not return proper HTTP status!\n";
        # unlink $file;   # the bogus file is deliberately left on disk
        return 404;
    }
    return 200;
}
Just in case you wanted all images in a series ;-)
| [reply] [d/l] |
|
|
Merlyn, I'm curious about something...
When you're whipping up short little scripts like this, do you find yourself using syntax like /a\/b\/c\/d/g instead of m{a/b/c/d}g out of habit from the old days when there wasn't much choice, or is it a conscious choice?
| [reply] [d/l] [select] |
|
I'm wrestling GNU Emacs "cperl-mode", which seems to not precisely appreciate the m{} and s{}{}g very well. But it seems to get the backwhacked version OK.
| [reply] [d/l] [select] |
|
|
merlyn,
This certainly isn't as succinct but I wanted to offer my contribution. It gets 110 of the 118 non-exclusive pics. All 8 of Marisa Miller's are skipped. It was fun - *shrug*.
I didn't bother fixing it to get the exclusive photos, but it shouldn't be too difficult. Incidentally, there is a non-linked photo we both miss. Carolyn Murphy has a hidden #2 pic.
| [reply] [d/l] |
Re^2: Swimsuits2004
by zentara (Archbishop) on Jan 20, 2005 at 11:39 UTC
|
As usual, Merlyn does it best. :-) Maybe you should offer your service to SI, to better protect their "private" content? Then again, maybe not! :-)
I'm not really a human, but I play one on earth.
flash japh
| [reply] |
Re^2: Swimsuits2004
by reasonablekeith (Deacon) on Jan 26, 2005 at 23:26 UTC
|
May I suggest, for those who might not have linux, the following code, which will generate html link pages by genre and model name.
Please feel free to add html / body etc. tags if the non-compliance upsets you. Hmmm, I'm already thinking there should be a use CGI in there somewhere.
Enjoy.
#!/usr/bin/perl
# Generate static HTML link pages for the downloaded images: a master
# index, one page per group (year or model), and one page per
# group/model combination showing the images themselves.
use strict;
use warnings;
$|++;

# $files{$group}{$sub}{$file} = 1, where $group is a four-digit year (or
# a model name for the per-model rollup) and $sub a model name (or 'all').
my %files;
foreach my $file_name (glob "RESULTS/*") {
    my ($top_level, $model_name) = ($file_name =~ m{RESULTS/(.*?)_(.*?)_.*$});
    next unless defined $model_name;    # ignore files that don't match
    $top_level =~ s/^9/199/ or $top_level =~ s/^0/200/;   # 9x->199x, 0x->200x
    $files{$top_level}{$model_name}{$file_name} = 1;
    $files{$model_name}{'all'}{$file_name} = 1;           # per-model rollup
}

-d 'HTML' or mkdir 'HTML' or die "mkdir HTML: $!";

# Lexical handles, three-arg open, checked, with forward slashes (the
# original's "HTML\\..." backslash paths only worked on Windows).
open my $index, '>', 'HTML/index.html' or die "open HTML/index.html: $!";
foreach my $top_level (sort keys %files) {
    print {$index} qq|<a href="./index_$top_level.html">$top_level</a><br/>\n|;
    open my $top_index, '>', "HTML/index_$top_level.html"
        or die "open HTML/index_$top_level.html: $!";
    foreach my $model_name (sort keys %{$files{$top_level}}) {
        my $page = "HTML/index_${top_level}_${model_name}.html";
        open my $model_index, '>', $page or die "open $page: $!";
        my $image_count = 0;
        foreach my $file_name (sort keys %{$files{$top_level}{$model_name}}) {
            $image_count++;
            print {$model_index} qq|<img src="../$file_name"><br/>\n|;
        }
        close $model_index or die "close $page: $!";
        print {$top_index}
            qq|<a href="./index_${top_level}_${model_name}.html">$model_name ($image_count)</a><br/>\n|;
    }
    close $top_index or die "close HTML/index_$top_level.html: $!";
}
close $index or die "close HTML/index.html: $!";
| [reply] [d/l] |
Re^2: Swimsuits2004
by jbrugger (Parson) on Feb 14, 2005 at 07:29 UTC
|
Hmm...
I just could not help myself, so I created thumbs and links...
Have fun :-) (ps. feel free to improve it... that's the real fun, isn't it?)
#!/usr/bin/perl -w
# CGI thumbnail gallery: renders a paged table of image thumbnails
# (generated on demand with Image::Magick) linking to the originals.
use strict;
use Image::Magick;
use File::Glob qw(:globally :nocase);
use File::Basename;
use CGI;

# Configuration.  (The original quoted single strings with qw(), which
# only works by accident; plain string literals are intended.)
my $realdir   = '/var/www/html/thumbnailer/images/'; # dir holding the original images
my @ext       = qw( jpg png gif );                   # image extensions to look for
my $savedir   = '/var/www/html/thumbnailer/thumbs/'; # dir to save thumbnails
my $serverdir = '/thumbnailer/';                     # relative server path
my $thumbdir  = 'thumbs/';                           # thumbnail dir in page URLs
my $imagedir  = 'images/';                           # image dir in page URLs
my $x_size    = 150;    # thumbnail width in pixels
my $y_size    = 150;    # thumbnail height in pixels
my $resize    = 1;      # resize before crop
my $aspect_S  = 0.5;    # only resize if aspect ratio is above this value...
my $aspect_H  = 2;      # ...and below this one, else the thumbnail gets too blurred
my $overwrite = 0;      # always remake (overwrite) the thumbnails
my $cols      = 5;      # max horizontal thumbs
my $rows      = 10;     # max vertical thumbs

my $cgi = CGI->new;     # direct method call, not indirect "new CGI"

main();
cleanUp();
# Build the thumbnail table for the requested page (?p=N) and emit the
# HTML.  Thumbnails are generated on first use and cached in $savedir.
sub main {
    my $content = "<tr>";
    my $files   = readDir();
    my $thumbs_per_page = $rows * $cols;
    my $total   = scalar @$files;             # scalar() already yields 0 when empty
    my $pages   = $total / $thumbs_per_page;  # may be fractional; compared with <
    my $currentPage = $cgi->param('p') ? $cgi->param('p') : 1;
    my $hasPrevious = $currentPage - 1 ? 1 : 0;
    my $hasNext     = ($currentPage < $pages) ? 1 : 0;
    my $startImage  = ($currentPage - 1) * $thumbs_per_page;
    my $nav = "";
    my $c   = 1;    # column within the current row
    my $i   = 0;    # 1-based index of the current file

    foreach my $file (@$files) {
        $i++;
        # On the last file, prepare the Previous/Next navigation row.
        if ($i >= $total) {
            $nav .= "<tr><td align=\"center\" nowrap=\"nowrap\" colspan=\"$cols\">";
            if ($hasPrevious) { $nav .= "<a href=\"?p=" . ($currentPage - 1) . "\">Previous<\/a>\ \ "; }
            if ($hasNext)     { $nav .= "<a href=\"?p=" . ($currentPage + 1) . "\">Next<\/a>"; }
            $nav .= "<\/td><\/tr>";
        }
        # Only render files inside the current page window.
        next if ($i <= $startImage || $i > ($startImage + $thumbs_per_page));
        if ($c > $cols) {            # row full: start a new one
            $content .= "<\/tr><tr>\n";
            $c = 1;
        }
        my $filename = "thumb_" . fileparse($file);
        # border=1 marks a freshly generated thumbnail, border=2 a cached
        # one (the original duplicated the whole <td> for this difference).
        my $border;
        if (!-e $savedir . $filename || $overwrite) {
            makeThumb($file, $savedir . $filename);
            $border = 1;
        } else {
            $border = 2;             # skip writing, reuse the cached thumb
        }
        $content .= " <td> <a href=\"" . $serverdir . $imagedir . fileparse($file)
                  . "\" > <img src=" . $serverdir . $thumbdir . $filename
                  . " alt=\"\" border=\"$border\"> <\/a><\/td> ";
        $c++;
    }
    $content .= "<\/tr>\n" . $nav;
    printHtml($content);
}

# Generate one $x_size x $y_size thumbnail for $file at $dest:
# resize (aspect-ratio permitting), then center-crop to the exact size.
sub makeThumb {
    my ($file, $dest) = @_;
    my $image = Image::Magick->new;
    $image->Read($file);
    my ($x, $y) = $image->Get('width', 'height');
    # Enlarge if the thumbnail is wider than the original, or when
    # resize-before-crop is enabled.
    if ($x < $x_size || $resize) {
        my $aspectratio = $y / $x;
        # Only resize when the aspect ratio sits between the configured
        # borders, else the thumbnail becomes too blurred.
        if ($aspectratio > $aspect_S && $aspectratio < $aspect_H || $x < $x_size) {
            $x = $x_size;
            $y = $x * $aspectratio;
            $image->Resize(width => $x, height => $y, filter => 'Cubic', blur => 1);
        }
    }
    if ($y < $y_size) {
        my $aspectratio = $x / $y;
        $y = $y_size;
        $x = $y * $aspectratio;
        $image->Resize(width => $x, height => $y, filter => 'Cubic', blur => 1);
    }
    # Center offsets, then crop to exactly $x_size x $y_size.
    my $ox = ($x - $x_size) / 2;
    my $oy = ($y - $y_size) / 2;
    $image->Crop("${x_size}x${y_size}+$ox+$oy");
    $image->Write($dest);
}
# Emit the complete CGI response: header, page shell, and the
# pre-built table rows passed in by main().
sub printHtml {
    my ($table_rows) = @_;
    my @response = (
        $cgi->header(-type => 'text/html', ),
        $cgi->start_html( -title => 'Testpage',
                          -BGCOLOR => '#ffffff',),
        "<table border=\"0\" cellpadding=\"0\" cellspacing=\"3\">\n",
        $table_rows,
        "\n<\/table>\n",
        $cgi->end_html,
    );
    print @response;
}
# Glob the image directory for every configured extension and return
# the matching paths as an array ref.
sub readDir {
    my $pattern = $realdir . "*.{" . join(",", @ext) . "}";
    return [ glob $pattern ];
}
# Release the script-level configuration variables.  The original wrote
# "undef $cgi, $realdir, ..." -- but undef is a named unary operator,
# so only the FIRST variable of each list was actually undefined; the
# rest were evaluated in void context.  Undef each one explicitly.
sub cleanUp {
    undef $cgi;
    undef $realdir;
    undef @ext;
    undef $serverdir;
    undef $savedir;
    undef $x_size;
    undef $cols;
    undef $y_size;
    undef $resize;
    undef $aspect_S;
    undef $aspect_H;
    undef $overwrite;
    undef $rows;
    undef $thumbdir;
    undef $imagedir;
}
| [reply] [d/l] |
|
|