Here is a 0.20 version with a minor but important improvement: as explained by
Corion walking in the cloister, you can force the UserAgent to request a domain name at a specified IP address.
I find this feature very handy, so I added it to my code: you can pass the new option
--force_ip_resolution 1.2.3.4
to achieve this. At every verbosity level of the program this is highlighted after the requested URL.
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use Time::HiRes qw(gettimeofday tv_interval);
use Getopt::Long;
use HTML::Parse;
## v 16: fixed a display bug with broken links
## problem with URLs like http://213.203.143.54/ada_portale/index.php?option=com_frontpage&Itemid=1&lang=italian
$|++;    # autoflush STDOUT so progress is visible immediately

# Command-line options.
my ( @url, $count, $sleep, $timeout, $showhead, $protocol, $verbosity, $forced_ip );

# Other globals: report separators and the per-URL statistics hash.
my ( $ssep, $bsep, %stat );
$bsep = ( '=' x 68 ) . "\n";
$ssep = ( '-' x 68 ) . "\n";

# Parse options; show usage when parsing fails or no URL was given.
# NOTE: the numeric specs used to be '=n', which is not a valid Getopt::Long
# type (only s/i/o/f exist) and made GetOptions die with "Error in option
# spec"; they are '=i' now, and count is '=i' too since it feeds 1..$count.
helpme()
    if !GetOptions(
        'count=i'               => \$count,
        'url=s'                 => \@url,
        'sleep=i'               => \$sleep,
        'timeout=i'             => \$timeout,
        'header'                => \$showhead,
        'protocol=s'            => \$protocol,
        'verbosity=i'           => \$verbosity,
        'force_ip_resolution=s' => \$forced_ip,
    )
    or !$url[0];

# Default values ($forced_ip intentionally stays undef when not given).
$count   ||= 1;
$sleep   ||= 1;
$timeout ||= 15;
$verbosity = defined $verbosity ? $verbosity : 1;
$protocol ||= 'http';
$protocol =~ s![:/]+!!g;    # accept forms like "http://" or "http:"
################################################################################
# LWP::UserAgent initialization
my $ua = LWP::UserAgent->new;
$ua->agent("libwww-perl/5.10.1");
$ua->timeout($timeout);
foreach my $repetition ( 1 .. $count ) {
    foreach my $u (@url) {
        # Normalize the URL: ensure a scheme prefix, drop a trailing slash,
        # lowercase the whole thing.
        $u = $protocol . '://' . $u unless $u =~ m/^$protocol/;
        $u =~ s{/$}{};
        $u = lc $u;

        # Fresh per-URL statistics (assignment empties the hash first).
        %stat = (
            sbody      => 0,       # size of the body skeleton of each frame
            sintlink   => 0,       # size of internally linked content
            sextlink   => 0,       # size of externally linked content
            tbody      => 0,       # time taken by the body skeleton of each frame
            tintlink   => 0,       # time taken by internal content
            textlink   => 0,       # time taken by external content
            cintlink   => 0,       # count of internal links
            cextlink   => 0,       # count of external links
            brokenlink => [],      # broken links
            mainurl    => $u,      # url got as arg
            pages      => [$u],    # main page or all the frames
            cache      => {},      # cache for included resources
            respcode   => undef,
            respmess   => undef,   # some responses
            resptitle  => undef,
            respserver => undef,
            resplength => undef,
            resphead   => undef,
        );

        # get_page() may push frame URLs onto pages while we iterate, so the
        # loop naturally follows frames as they are discovered.
        foreach my $page ( @{ $stat{pages} } ) {
            get_page( $page, $forced_ip );
        }
        foreach my $content_link ( keys %{ $stat{cache} } ) {
            time_link($content_link);
        }
        report();
    }
    sleep $sleep unless $repetition == $count;
}
######################################################################
+##########
sub get_page {
    # Fetch one page (the main URL or a frame), add its timing/size data to
    # %stat, and queue discovered frames (into pages) and embedded resources
    # (into cache) for later processing.
    my $urltoget  = shift;
    my $forced_ip = shift;    # optional: IP address that must serve the request
    my $resp;
    my $t0 = [gettimeofday];
    if ( defined $forced_ip ) {
        # Force name resolution: send the request to $forced_ip while keeping
        # the original host name in the Host header. The old code stripped the
        # scheme with s/\w+:\/\///g, put host+path into the Host header and
        # dropped the path from the request URL (so any URL with a path
        # fetched the IP's root page); splitting via URI fixes both, and
        # leaves $urltoget intact for the ->abs() calls below.
        my $uri  = URI->new($urltoget);
        my $host = $uri->host;
        my $path = $uri->path_query;    # '' for a bare http://host
        $resp = $ua->get( "$protocol://$forced_ip$path", Host => $host );
    }
    else { $resp = $ua->get($urltoget); }
    $stat{tbody} += tv_interval( $t0, [gettimeofday] );

    # Add the Content-Length declared for EACH frame/page loaded.
    $stat{resplength} += ( $resp->content_length || 0 );
    # Add the real byte count obtained for EACH frame/page loaded.
    $stat{sbody} += ( length( $resp->content ) || 0 );

    # Keep response details of the principal page only (first value wins,
    # frames loaded afterwards do not overwrite them).
    $stat{respcode}   = $stat{respcode}   || $resp->code;
    $stat{respmess}   = $stat{respmess}   || $resp->message;
    $stat{resptitle}  = $stat{resptitle}  || $resp->title;
    $stat{respserver} = $stat{respserver} || $resp->server;
    $stat{resphead}   = $stat{resphead}   || $resp->headers_as_string;

    # Now parse the HTML looking for frames and embedded resources.
    my $parsed_html = parse_html( $resp->content );
    foreach my $link_found ( @{ $parsed_html->extract_links(qw(body img src frame )) } ) {
        next if $$link_found[0] =~ /#/;    # brutally skip anchors
        my $uriobj = URI->new( $$link_found[0] );
        my $absurl = $uriobj->abs($urltoget);
        # If it is a frame, add it to pages, adding an iteration to this sub.
        if ( $$link_found[3] eq 'frame' ) { push @{ $stat{pages} }, "$absurl"; next }
        # Else it is content: cache it, length and time are stored later on.
        else { $stat{cache}{$absurl} = [] }
    }
}
######################################################################
+##########
sub time_link {
    # Fetch one embedded resource, record its size and download time in the
    # cache, and classify it as internal or external to the main URL's domain;
    # failed fetches are recorded as broken links.
    my $url = shift;
    my $t0  = [gettimeofday];
    my $resp     = $ua->get($url);
    my $ttaken   = tv_interval( $t0, [gettimeofday] );
    my $bytesrec = length( $resp->content );
    if ( $resp->is_success() ) {
        @{ $stat{'cache'}{$url} } = ( $bytesrec, $ttaken );
        # Official regex from http://search.cpan.org/~gaas/URI-1.58/URI.pm
        my ( $scheme, $domain, $path, $query, $fragment ) =
            $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
        # Tell internal from external content. \Q..\E stops the dots in the
        # domain acting as regex wildcards (the old bare /$domain/ also warned
        # on relative URLs where $domain came back undef).
        if ( defined $domain && $stat{mainurl} =~ /\Q$domain\E/ ) {
            $stat{sintlink} += $bytesrec;
            $stat{tintlink} += $ttaken;
            $stat{cintlink}++;
        }
        else {
            $stat{sextlink} += $bytesrec;
            $stat{textlink} += $ttaken;
            $stat{cextlink}++;
        }
    }
    else {
        push @{ $stat{'brokenlink'} }, $url;
    }
}
######################################################################
+##########
sub Arrotonda_Mega {
    # Human-readable size: repeatedly scale a byte count down by 1024 and
    # append the matching unit. Undefined or non-positive sizes render as
    # "0 bytes".
    my $size = shift;
    return "0 bytes" if !defined $size || $size <= 0;
    my @units = qw( bytes Kb Mb Gb );
    my $idx   = 0;
    while ( $size >= 1024 ) {
        $size /= 1024;
        $idx++;
    }
    return sprintf "%.4f %s", $size, $units[$idx];
}
######################################################################
+##########
sub report {
    # Print the statistics collected in %stat for the current URL, with a
    # level of detail chosen by $verbosity (0 = one line, 1 = short summary,
    # >1 = full report, >2 = also per-URL detail lists).
    my $totsize   = $stat{'sbody'} + $stat{'sintlink'} + $stat{'sextlink'};
    my $tottime   = $stat{'tbody'} + $stat{'tintlink'} + $stat{'textlink'};
    my $forcedtxt = defined $forced_ip ? "FORCED ON $forced_ip\t" : '';
    # Guard every throughput computation against a zero elapsed time: the old
    # code died with "Illegal division by zero" on very fast (e.g. cached or
    # empty) responses.
    my $rate = $tottime > 0 ? $totsize / $tottime : 0;
    ####################
    if ( $verbosity == 0 ) {
        print scalar localtime(time), "\t",
            $stat{mainurl}, "\t", $forcedtxt, $stat{respcode}, "\t",
            $totsize, "\t",
            $tottime, "\t",
            ( Arrotonda_Mega($rate) ), "/sec", "\n";
    }
    ####################
    elsif ( $verbosity == 1 ) {
        print $bsep, $stat{mainurl}, "\t", $forcedtxt, scalar localtime(time), "\n", $ssep,
            $stat{respcode}, "\t", $stat{respmess} || 'UNDEF', "\t", $stat{resptitle} || 'UNDEF', "\n",
            $ssep,
            "downloaded ", Arrotonda_Mega($totsize), " ($totsize bytes) in ",
            $tottime, " seconds (",
            Arrotonda_Mega($rate), "/s)\n", $bsep;
    }
    ####################
    elsif ( $verbosity > 1 ) {
        print $bsep, $stat{mainurl}, "\t", $forcedtxt, scalar localtime(time), "\n", $ssep,
            "Response code:             ", $stat{respcode}   || 'UNDEF', "\n",
            "Response message:          ", $stat{respmess}   || 'UNDEF', "\n",
            "Response server:           ", $stat{respserver} || 'UNDEF', "\n",
            "Response declared length:  ", $stat{resplength} || 'UNDEF', "\n",
            "Response title:            ", $stat{resptitle}  || 'UNDEF', "\n",
            $ssep,
            "main page content (", scalar @{ $stat{pages} }, "):\t",
            Arrotonda_Mega( $stat{sbody} ),
            " in ", $stat{tbody}, " seconds \@ ",
            Arrotonda_Mega( $stat{tbody} > 0 ? $stat{sbody} / $stat{tbody} : 0 ), "/s\n";
        if ( $verbosity > 2 ) {
            print $ssep, "\tdetail of loaded pages (url):\n", $ssep,
                "\t", join( "\n\t", @{ $stat{pages} } ), "\n", $ssep;
        }
        ### report about extra downloaded content local to $url
        if ( $stat{cintlink} > 0 ) {
            print "included content ($stat{cintlink}):\t",
                Arrotonda_Mega( $stat{sintlink} ),
                " in ", $stat{tintlink}, " seconds \@ ",
                Arrotonda_Mega( $stat{tintlink} > 0 ? $stat{sintlink} / $stat{tintlink} : 0 ), "/s\n";
        }
        else { print "no included content found.\n" }
        ### report about extra downloaded content external to $url
        if ( $stat{cextlink} > 0 ) {
            print "external content ($stat{cextlink}):\t",
                Arrotonda_Mega( $stat{sextlink} ),
                " in ", $stat{textlink}, " seconds \@ ",
                Arrotonda_Mega( $stat{textlink} > 0 ? $stat{sextlink} / $stat{textlink} : 0 ), "/s\n";
        }
        else { print "no external content found.\n" }
        ### report about broken links
        if ( scalar @{ $stat{brokenlink} } > 0 ) {
            print "broken links found:\t", scalar @{ $stat{brokenlink} }, "\n", $ssep;
            if ( $verbosity > 2 ) {
                print "\tdetail of broken links (url):\n", $ssep,
                    ( map { "\t" . $_ . "\n" } @{ $stat{brokenlink} } ), $ssep;
            }
        }
        else { print "no broken links found.\n", $ssep }
        if ( ( $verbosity > 2 ) && keys %{ $stat{cache} } > 0 ) {
            print "\tdetail of loaded content (url bytes seconds):\n", $ssep,
                ( map { "\t$_ @{$stat{cache}{$_}}\n" } sort keys %{ $stat{cache} } ),
                $ssep;
        }
        ## total
        print "downloaded ", Arrotonda_Mega($totsize), " ($totsize bytes) in ",
            $tottime, " seconds (",
            Arrotonda_Mega($rate), "/s)\n", $bsep;
    }
    # verbosity set to something strange: switch to the default and retry
    else { $verbosity = 1; report() }
    # eventually print headers
    if ($showhead) {
        print "headers received:\n", $ssep, $stat{resphead}, "\n", $bsep;
    }
}
######################################################################
+##########
sub helpme {
    # Print the usage message (interpolating the script name) and exit cleanly.
    print <<"EOH";
Utilizzo di $0
$0 -u URL [-u URL2 -u URL3 --verbosity [0-3] --header --sleep n --count n --timeout n --protocol s --force_ip_resolution s]
EOH
    exit 0;
}
there are no rules, there are no thumbs..