comment on

#!/usr/bin/perl -w
use strict;

# goose.pl

################
# Written by Matthew Diephouse
# Contact at "matt --at-- diephouse --dot-- com"
# where "--at--" is @ and "--dot--" is .
#
# Copyright 2002.  This may be modified and distributed
# on the same terms as Perl.
################

my $browser = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:0.9.9) Gecko
+/20020310 ';
my %proxy = (
  host => '',    # http://host.dom:port
  id   => '',    # ntdom\userid
  pass => '',    # empty quotes if no proxy auth
);

use LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent($browser);
$ua->proxy(http => "$proxy{host}")
    if defined $proxy{host}; 

use Getopt::Long;
my ($images, $compact, $path, $filename, $depth, $offsite, $help) = (1
+, 1, "./"); # initialize and provide some defaults
GetOptions(  'images!'    => \$images,
            'offsite!'    => \$offsite,
            'help'        => \$help,
            'depth=i'    => \$depth,
            'path=s'    => \$path,
            'filename=s' => \$filename,
            'compact!'    => \$compact);

if ($help) {
    print <<EOH;
goose.pl -- a utility to grab site from the web
---------------------------------
usage: goose.pl [options] location
options:
    --images:
    --noimages:
        whether or not to download images
        defaults to yes
    --offsite:
    --nooffsite:
        whether or not to follow offsite links
        defaults to no
    --help
        prints this help message
    --depth
        the link depth to follow
        defaults to 0
    --path
        where to save the files to
    --filename
        the name of the first file. future names are
        made using the increment operator (++)
    --compact
    --nocompact
        whether or not to remove certain html tags
        defaults to use
EOH
    exit;    
}

my $href = shift or die "Must provide a location to be goosed. Use opt
+ion --help for usage information.";

$path .= "/"
    if $path !~ /[\/\\]$/;

my %links;    # holds location and filename
$|++;         # unbuffer output; allows progress to be shown
mkdir $path
    if not -e $path;

#------------ setup ends here

{
    my $count = 0;
    sub update_progress {
        print "\rGoosing $href... got $count files";
        $count++
    }
}

#feedit: html (ref), hash with tags to cut and options (hash ref)
#receive: compacted version
#effect: compacts ref
sub compact {
    my ($html, %tags) = @_;
    
    for my $tag (keys %tags) {
        # if it's not closed
        if (not $tags{$tag}->{closed})
        {
            $$html =~ s! < \Q$tag\E .*? > !!xgis;
        }
        
        # if it's closed and we should remove content
        elsif (not $tags{$tag}->{content})
        {
            1 while remove_tag($tag, $html);
        }
        
        # if it's not closed and we should leave content
        elsif ($tags{$tag}->{content})
        {
            $$html =~ s! < /? \Q$tag\E .*? > !!xgis;
        }
    }
    return $$html;
}

#feedit: tag name, html (ref)
#receive: success indicator
#effect: remove's tag from html
sub remove_tag {
    my ($tag, $html) = @_;
    my $pre = "";
    my ($open, $content, $close);
    
    while (1) {
        $$html =~ m! (?<= \Q$pre\E )  (< \Q$tag\E [^>]* >) (.*?) (</ \
+Q$tag\E \s*? >) !xi
            || return 0;
    
        ($open, $content, $close) = ($1, $2, $3);
        
        # if it has a nested tag of the same kind
        last if $content !~ /<\Q$tag\E/;
        
        $pre .= $open;
    }
    
    $$html =~ s/\Q$open$content$close\E//;
    
    return 1;
}


#feedit: location, depth, images?, offsite?, path
#receive: number of files
#effect: save files
sub goose {
    my %options = @_;
    $options{location} || die "must give a location to goose";
    $options{master} ||= ($options{location} =~ m!(http://[^/]+)! && $
+1);

    $links{$options{location}} ||= $filename++ . ".html";

    my $count = 1; # number of files saved
    my $page = get_page( $options{location} );
    return 0 if not $page; # if get_page() fails
    
    if ($options{compact}) {
        compact( \$page, style => { closed => 1 },
                        meta => { closed => 0 },
                        link   => { closed => 0} )
    }

    update_progress();
    
    # receive just the files that still need to be goosed
    my @files = extract_links( \$page,
                        $options{images},
                        $options{offsite},
                        $options{depth},
                        $options{master},
                        $options{location} );
    # get images if specified
    if ($options{images}) {
        my @images = extract_images( \$page, $options{master}, $option
+s{location} );

        for my $image (@images) {
            $count += get_image($image);
        }
    }
        
    open FILE, ">$options{path}$links{$options{location}}" || die "cou
+ldn't open $links{$options{location}}";
    print FILE $page;
    close FILE;

    for my $file (@files) {
        $count +=
             goose( location => $file,
                depth    => $options{depth} - 1,
                images    => $options{images},
                offsite    => $options{offsite},
                master    => $options{master},
                path        => $options{path},
                compact    => $options{compact});
    }
    
    return $count;
}

#feedit: the name of the page to be retrieved
#receive: the html
#effect: none
sub get_page {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;
    
    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if defined $proxy{id};
    my $result = $ua->request($request);
    
    return $result->content # if everything went right
        if $result->is_success;
    
    return get_page($location, 0) #try once more (default once)
        if $tryagain;
    return ""; # cop out
}

#feedit: the name of the image to be retrieved
#receive: the number of files saved
#effect: saves the files
sub get_image {
    my ($location, $tryagain) = @_;
    $tryagain = 1 if not defined $tryagain;

    my $request = HTTP::Request->new(GET => $location);
    $request->proxy_authorization_basic( $proxy{id}, $proxy{pass} )
        if defined $proxy{id};
    my $result = $ua->request($request);

    # if everything went right
    if ($result->is_success) {
        open FILE, ">" . $path . $links{$location} || die "couldn't op
+en $links{$location}";
        binmode(FILE);
        print FILE $result->content;
        close FILE;
        update_progress();
        return 1;
    }
    
    return get_image($location, 0) #try once more (default once)
        if $tryagain;
    return 0; # cop out
}

#feedit: the html (ref), images?, offsite?, depth
#receive: the names of pages yet to be goosed
#effect: change links of html
sub extract_links {
    my ($html, $images, $offsite, $depth, $master, $parent_location) =
+ @_;
    my @pages; # ones that still need to be goosed
    
    $$html =~ s{ ( <a \s+ [^>]+ > ) }
             { new_link($1, "href", $depth, $offsite, $master, $parent
+_location, \@pages, "html") }xgei;
    
    return @pages;
}

#feedit: html with img tags, master, parent location
#receive: array of images do download
#effect: none
sub extract_images {
    my ($html, $master, $parent) = @_;
    my @images;

    $$html =~ s{ ( <img \s+ [^>]+ > ) }
                 {new_link($1, "src", 1, 1, $master, $parent, \@images
+)}xgei;
    
    return @images
}

#feedit: link, type, depth, offsite, master, parent location, pages (r
+ef), $ext (opt)
#receive: a new link (possibly)
#effect: adds an entry to @pages if necessary
sub new_link {
    my ($tag, $type, $depth, $offsite, $master, $parent_location, $pag
+es, $ext) = @_;
      
    my $link = get_attribute($tag, $type) || return $tag;
    # if ext is provided, get the ext of the file being saved
    $ext ||= ($link =~ m!/[^/]+ \. (\w+)  (?: \? [\w%&;\s#=]+ )?$!x &&
+ $1);

    # if it's a mailto: link
    return $tag
        if $link =~ /^mailto:/;
    # if it's javascript
    return $tag
        if $link =~ /^javascript:/;
    
    my $fullpath = fullpath( $link, $parent_location );
    $fullpath =~ m!^(https?://[^/]+)!;
    my $root = $1;

    # if it's already been goosed or is queued up
    return set_attribute($tag, $type, $links{$fullpath})
        if defined $links{$fullpath};
    # if we're done goosing
    return set_attribute($tag, $type, $fullpath)
        if not $depth;
    # if it's an offsite link and we don't want it
    return $tag
        if lc($root) ne lc($master)
          && not $offsite;
    
    # else queue it up
    push @$pages, $fullpath;
    $links{$fullpath} = $filename++ . "." . $ext;
    
    return set_attribute($tag, $type, $links{$fullpath})
}

#feedit: location, location where found
#receive: full location
#effect: none
sub fullpath {
    my ($relative, $found) = @_;
    
    #if it's not a filename and doesn't have a /
    $found .= "/"
        if $found !~ m!https?://.*/.*!i;
    
    return $relative
        if $relative =~ m!^https?://!i;
    $found =~ s![^/]+$!!; # remove filename at end
    return $1 . $relative    # /foo/bar
        if $relative =~ s/^\///
           && $found =~ m!^(https?://.+?/)!i;
    $found =~ s![^/]+/$!! # for relative urls (../whatever)
        while $relative =~ s!^\.\./!!;
    1 while $relative =~ s!^\./!!; # for urls relative to the current 
+directory ( ./whatever)
    return $found . $relative;
}

#feedit: a tag, attribute
#receive: value of attribute
#effect: none
sub get_attribute {
    my ($tag, $attribute) = @_;
    
    # normal tag
    return $2
        if $tag =~ /\Q$attribute\E \s* = \s* (['"]) (.+?) \1/xi;
    
    # tag with no quotes
    return $1
        if $tag =~ /\Q$attribute\E=([^\s>]+)/;
    
    # else
    return 0;
}

#feedit: tag, attribute, value
#receive: tag
#effect: none
sub set_attribute {
    my ($tag, $attribute, $value) = @_;

        # normal tag
    return $tag
        if $tag =~ s/(\Q$attribute\E \s* = \s* (['"])) .+? ( \2)/$1$va
+lue$2/xi;
    
    # tag with no quotes
    return $tag
        if $tag =~ s/(\Q$attribute\E=)[^\s>]+/$1$value/i;
    
    # else
    die "\nERROR: couldn't set attribute '$attribute' to '$value' for 
+'$tag'\n";
}

update_progress();
my $count = goose( location => $href,
                depth    => $depth,
                offsite   => $offsite,
                images  => $images,
                path         => $path,
                compact => $compact) || die " it didn't work!";
print "\rGoosing $href... Done",
    " " x (length($count) + 6),
    "\n$count files received\n";
[download]
In reply to goose.pl by elusion
Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.

We don't bite newbies here... much
	PerlMonks