#!/usr/bin/perl -w
use strict;
# goose.pl
################
# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
#
# Copyright 2002. This may be modified and distributed
# on the same terms as Perl.
################
my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko
+/20020310 ';
my %proxy = (
host => '', # http://host.dom:port
id => '', # ntdom\userid
pass => '', # empty quotes if no proxy auth
);
use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent($browser);
$ua->proxy(http => "$proxy{host}")
if defined $proxy{host};
use Getopt::Long;
my ($images, $compact, $path, $filename, $depth, $offsite, $help) = (1
+, 1, "./"); # initialize and provide some defaults
GetOptions( 'images!' => \$images,
'offsite!' => \$offsite,
'help' => \$help,
'depth=i' => \$depth,
'path=s' => \$path,
'filename=s' => \$filename,
'compact!' => \$compact);
if ($help) {
print <<EOH;
goose.pl -- a utility to grab site from the web
---------------------------------
usage: goose.pl [options] location
options:
--images:
--noimages:
whether or not to download images
defaults to yes
--offsite:
--nooffsite:
whether or not to follow offsite links
defaults to no
--help
prints this help message
--depth
the link depth to follow
defaults to 0
--path
where to save the files to
--filename
the name of the first file. future names are
made using the increment operator (++)
--compact
--nocompact
whether or not to remove certain html tags
defaults to use
EOH
exit;
}
my $href = shift or die "Must provide a location to be goosed. Use opt
+ion --help for usage information.";
$path .= "/"
if $path !~ /[\/\\]$/;
my %links; # holds location and filename
$|++; # unbuffer output; allows progress to be shown
mkdir $path
if not -e $path;
#------------ setup ends here
{
my $count = 0;
sub update_progress {
print "\rGoosing $href... got $count files";
$count++
}
}
#feedit: html (ref), hash with tags to cut and options (hash ref)
#receive: compacted version
#effect: compacts ref
sub compact {
my ($html, %tags) = @_;
for my $tag (keys %tags) {
# if it's not closed
if (not $tags{$tag}->{closed})
{
$$html =~ s! < \Q$tag\E .*? > !!xgis;
}
# if it's closed and we should remove content
elsif (not $tags{$tag}->{content})
{
1 while remove_tag($tag, $html);
}
# if it's not closed and we should leave content
elsif ($tags{$tag}->{content})
{
$$html =~ s! < /? \Q$tag\E .*? > !!xgis;
}
}
return $$html;
}
#feedit: tag name, html (ref)
#receive: success indicator
#effect: remove's tag from html
sub remove_tag {
my ($tag, $html) = @_;
my $pre = "";
my ($open, $content, $close);
while (1) {
$$html =~ m! (?<= \Q$pre\E ) (< \Q$tag\E [^>]* >) (.*?) (</ \
+Q$tag\E \s*? >) !xi
|| return 0;
($open, $content, $close) = ($1, $2, $3);
# if it has a nested tag of the same kind
last if $content !~ /<\Q$tag\E/;
$pre .= $open;
}
$$html =~ s/\Q$open$content$close\E//;
return 1;
}
#feedit: location, depth, images?, offsite?, path
#receive: number of files
#effect: save files
sub goose {
my %options = @_;
$options{location} || die "must give a location to goose";
$options{master} ||= ($options{location} =~ m!(http://[^/]+)! && $
+1);
$links{$options{location}} ||= $filename++ . ".html";
my $count = 1; # number of files saved
my $page = get_page( $options{location} );
return 0 if not $page; # if get_page() fails
if ($options{compact}) {
compact( \$page, style => { closed => 1 },
meta => { closed => 0 },
link => { closed => 0} )
}
update_progress();
# receive just the files that still need to be goosed
my @files = extract_links( \$page,
$options{images},
$options{offsite},
$options{depth},
$options{master},
$options{location} );
# get images if specified
if ($options{images}) {
my @images = extract_images( \$page, $options{master}, $option
+s{location} );
for my $image (@images) {
$count += get_image($image);
}
}
open FILE, ">$options{path}$links{$options{location}}" || die "cou
+ldn't open $links{$options{location}}";
print FILE $page;
close FILE;
for my $file (@files) {
$count +=
goose( location => $file,
depth => $options{depth} - 1,
images => $options{images},
offsite => $options{offsite},
master => $options{master},
path => $options{path},
compact => $options{compact});
}
return $count;
}
#feedit: the name of the page to be retrieved
#receive: the html
#effect: none
sub get_page {
my ($location, $tryagain) = @_;
$tryagain = 1 if not defined $tryagain;
my $request = HTTP::Request->new(GET => $location);
$request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
if defined $proxy{id};
my $result = $ua->request($request);
return $result->content # if everything went right
if $result->is_success;
return get_page($location, 0) #try once more (default once)
if $tryagain;
return ""; # cop out
}
#feedit: the name of the image to be retrieved
#receive: the number of files saved
#effect: saves the files
sub get_image {
my ($location, $tryagain) = @_;
$tryagain = 1 if not defined $tryagain;
my $request = HTTP::Request->new(GET => $location);
$request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
if defined $proxy{id};
my $result = $ua->request($request);
# if everything went right
if ($result->is_success) {
open FILE, ">" . $path . $links{$location} || die "couldn't op
+en $links{$location}";
binmode(FILE);
print FILE $result->content;
close FILE;
update_progress();
return 1;
}
return get_image($location, 0) #try once more (default once)
if $tryagain;
return 0; # cop out
}
#feedit: the html (ref), images?, offsite?, depth
#receive: the names of pages yet to be goosed
#effect: change links of html
sub extract_links {
my ($html, $images, $offsite, $depth, $master, $parent_location) =
+ @_;
my @pages; # ones that still need to be goosed
$$html =~ s{ ( <a \s+ [^>]+ > ) }
{ new_link($1, "href", $depth, $offsite, $master, $parent
+_location, \@pages, "html") }xgei;
return @pages;
}
#feedit: html with img tags, master, parent location
#receive: array of images do download
#effect: none
sub extract_images {
my ($html, $master, $parent) = @_;
my @images;
$$html =~ s{ ( <img \s+ [^>]+ > ) }
{new_link($1, "src", 1, 1, $master, $parent, \@images
+)}xgei;
return @images
}
#feedit: link, type, depth, offsite, master, parent location, pages (r
+ef), $ext (opt)
#receive: a new link (possibly)
#effect: adds an entry to @pages if necessary
sub new_link {
my ($tag, $type, $depth, $offsite, $master, $parent_location, $pag
+es, $ext) = @_;
my $link = get_attribute($tag, $type) || return $tag;
# if ext is provided, get the ext of the file being saved
$ext ||= ($link =~ m!/[^/]+ \. (\w+) (?: \? [\w%&;\s#=]+ )?$!x &&
+ $1);
# if it's a mailto: link
return $tag
if $link =~ /^mailto:/;
# if it's javascript
return $tag
if $link =~ /^javascript:/;
my $fullpath = fullpath( $link, $parent_location );
$fullpath =~ m!^(https?://[^/]+)!;
my $root = $1;
# if it's already been goosed or is queued up
return set_attribute($tag, $type, $links{$fullpath})
if defined $links{$fullpath};
# if we're done goosing
return set_attribute($tag, $type, $fullpath)
if not $depth;
# if it's an offsite link and we don't want it
return $tag
if lc($root) ne lc($master)
&& not $offsite;
# else queue it up
push @$pages, $fullpath;
$links{$fullpath} = $filename++ . "." . $ext;
return set_attribute($tag, $type, $links{$fullpath})
}
#feedit: location, location where found
#receive: full location
#effect: none
sub fullpath {
my ($relative, $found) = @_;
#if it's not a filename and doesn't have a /
$found .= "/"
if $found !~ m!https?://.*/.*!i;
return $relative
if $relative =~ m!^https?://!i;
$found =~ s![^/]+$!!; # remove filename at end
return $1 . $relative # /foo/bar
if $relative =~ s/^\///
&& $found =~ m!^(https?://.+?/)!i;
$found =~ s![^/]+/$!! # for relative urls (../whatever)
while $relative =~ s!^\.\./!!;
1 while $relative =~ s!^\./!!; # for urls relative to the current
+directory ( ./whatever)
return $found . $relative;
}
#feedit: a tag, attribute
#receive: value of attribute
#effect: none
sub get_attribute {
my ($tag, $attribute) = @_;
# normal tag
return $2
if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi;
# tag with no quotes
return $1
if $tag =~ /\Q$attribute\E=([^\s>]+)/;
# else
return 0;
}
#feedit: tag, attribute, value
#receive: tag
#effect: none
sub set_attribute {
my ($tag, $attribute, $value) = @_;
# normal tag
return $tag
if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$va
+lue$2/xi;
# tag with no quotes
return $tag
if $tag =~ s/(\Q$attribute\E=)[^\s>]+/$1$value/i;
# else
die "\nERROR: couldn't set attribute '$attribute' to '$value' for
+'$tag'\n";
}
update_progress();
my $count = goose( location => $href,
depth => $depth,
offsite => $offsite,
images => $images,
path => $path,
compact => $compact) || die " it didn't work!";
print "\rGoosing $href... Done",
" " x (length($count) + 6),
"\n$count files received\n";
|