A new version! 0.23
The new parameter -r --render saves the content to a temporary file and opens it at the end of the program.
It creates only a similar page, not an identical one, just to give you an idea of what you are fetching.
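For example, on Windows (sitecheck.pl is just a placeholder for whatever name you saved the script under):

    perl sitecheck.pl -u www.perl.org -r

This fetches the page once, saves an approximate local copy under %TEMP% and opens it with the default browser when the program ends.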
Also created a decent help screen and intercepted some possible "Illegal division by zero" errors.
enjoy:
UPDATE January 31 2014: removed $u = lc($u) because it was buggy: you CAN have uppercase characters in URLs.
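A quick demonstration of why that line had to go (a minimal sketch; the URL is made up): only the scheme and the host of a URL are case-insensitive, the path is not, so lowercasing the whole thing can turn a valid link into a 404 on a case-sensitive server.

    use URI;
    my $u = URI->new('http://Example.Com/Docs/Index.HTML');
    print lc("$u"), "\n";       # http://example.com/docs/index.html <- path case destroyed
    print $u->canonical, "\n";  # http://example.com/Docs/Index.HTML <- only scheme and host lowercased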
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use Time::HiRes qw(gettimeofday tv_interval);
use Getopt::Long;
use HTML::Parse;
use URI; # URI->new is called directly below, so load it explicitly
## v 16 fixed the broken links display bug
##      and a problem with URLs like http://213.203.143.54/ada_portale/index.php?option=com_frontpage&Itemid=1&lang=italian
# v 20 added force_ip_resolution
# v 22 rendering OK (in time_link and get_page) and %0.6f formatting for $tottime in report
# v 23 help added
$|++;
# options
my (@url, $count, $sleep, $timeout, $showhead, $protocol, $verbosity, $forced_ip, $render);
# other globals
my ($ssep, $bsep,%stat);
$bsep = ('=' x 68)."\n";
$ssep = ('-' x 68)."\n";
&helpme() if ( (! GetOptions(
                'count=i'                => \$count,
                'url=s'                  => \@url,
                'sleep=i'                => \$sleep,
                'timeout=i'              => \$timeout,
                'header'                 => \$showhead,
                'protocol=s'             => \$protocol,
                'verbosity=i'            => \$verbosity,
                'force_ip_resolution=s'  => \$forced_ip,
                'render'                 => \$render,
              )) or (!$url[0]));
# some default values
$count   ||= 1;
$sleep   ||= 1;
$timeout ||= 15;
$verbosity = defined $verbosity ? $verbosity : 1;
$protocol ||= 'http';
$protocol =~ s![:/]+!!g;
$forced_ip ||= undef;
$render    ||= undef;
if ($render) { $count = 1; @url = ($url[0]) } # render implies a single fetch of the first url only
################################################################################
# initialize LWP::UserAgent
my $ua = LWP::UserAgent->new;
$ua->agent("libwww-perl/5.10.1");
$ua->timeout($timeout);
################################################################################
foreach my $repetition(1..$count) {
foreach my $u (@url) {
unless ( $u =~ m/^$protocol/ ){$u=$protocol.'://'.$u};
$u =~ s/\/$//; # remove a trailing / if present
#$u=lc($u);####THIS IS BUGGED, dunno why i put it...
undef %stat;
%stat = (
    sbody      => 0,     # size of the body skeleton of each frame
    sintlink   => 0,     # size of internal linked content
    sextlink   => 0,     # size of external linked content
    tbody      => 0,     # time taken by the body skeleton of each frame
    tintlink   => 0,     # time taken by internal content
    textlink   => 0,     # time taken by external content
    cintlink   => 0,     # count of internal links
    cextlink   => 0,     # count of external links
    brokenlink => [],    # broken links
    mainurl    => $u,    # url got as arg
    pages      => [$u],  # main page or all the frames
    cache      => {},    # cache for included resources
    respcode   => undef, #
    respmess   => undef, # some data from the response
    resptitle  => undef, #
    respserver => undef, #
    resplength => undef, #
    resphead   => undef, #
);
foreach my $page ( @{$stat{'pages'}})
{ &get_page($page, $forced_ip) }
foreach my $content_link (keys %{$stat{'cache'}})
{ &time_link($content_link) }
&report;
}
sleep ($sleep) unless $repetition == $count;
}
################################################################################
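# get_page: fetch one page (frames end up here too), optionally via a forced
# IP; accumulate size and time in %stat, queue any frame in $stat{pages} and
# any linked resource in $stat{cache}; with --render also save a local copy.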
sub get_page {
my $urltoget = shift;
my $forced_ip = shift;
my $resp;
my $t0 = [gettimeofday];
    if (defined $forced_ip) {
        # force the resolution: request the page from the given IP,
        # passing the original hostname in the Host header
        $urltoget =~ s/\w+:\/\///g;
        $forced_ip = $protocol.'://'.$forced_ip;
        $resp = $ua->get( $forced_ip, Host => $urltoget );
    }
    else { $resp = $ua->get($urltoget); }
$stat{tbody} += tv_interval ($t0, [gettimeofday]);
    # add the content_length declared for each frame/page loaded
    $stat{resplength} += ($resp->content_length || 0);
    # add the real byte length obtained for each frame/page loaded
    $stat{sbody} += (length($resp->content) || 0);
    # some more info, kept only for the principal page (not for any frame loaded later)
    $stat{respcode}   = $stat{respcode}   || $resp->code;
    $stat{respmess}   = $stat{respmess}   || $resp->message;
    $stat{resptitle}  = $stat{resptitle}  || $resp->title;
    $stat{respserver} = $stat{respserver} || $resp->server;
    $stat{resphead}   = $stat{resphead}   || $resp->headers_as_string;
    # now parse the HTML
    my $parsed_html = parse_html($resp->content);
    foreach my $link_found (@{ $parsed_html->extract_links(qw(body img src frame)) }) {
        next if $$link_found[0] =~ /#/;   # brutally skip anchors
        my $uriobj = URI->new($$link_found[0]);
        my $absurl = $uriobj->abs($urltoget);
        # if it is a frame, add it to pages: this adds an iteration of this sub
        if ($$link_found[3] eq 'frame') { push @{$stat{'pages'}}, "$absurl"; next } # "" stringifies $absurl
        # else it is content: add it to the cache hash (length and time will be stored there later on)
        else { $stat{cache}{ $absurl } = [] }
    }
    if ($render) {
        # save an approximate local copy of the page under %TEMP%
        mkdir "$ENV{TEMP}\\_temp_files" or die "unable to create $ENV{TEMP}\\_temp_files: $!"
            unless -d "$ENV{TEMP}\\_temp_files";
        open my $render_fh, '>', "$ENV{TEMP}/_temp.html"
            or die "unable to write to %TEMP%\\_temp.html: $!";
        # point the src attributes at the local _temp_files directory
        (my $localcont = $resp->content) =~ s/src="([^"]*)\//src=".\/_temp_files\//gm;
        print $render_fh $localcont;
        close $render_fh;
        #system ("start $ENV{TEMP}/_temp.html");
    }
}
################################################################################
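# time_link: fetch one linked resource, record its size and download time in
# $stat{cache}, classify it as internal or external content (by comparing its
# domain with the main url) and collect broken links; with --render also save
# the resource under %TEMP%\_temp_files.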
sub time_link {
my $url = shift;
my $t0 = [gettimeofday];
my $resp = $ua->get($url);
my $ttaken = tv_interval ($t0, [gettimeofday]);
#$ttaken = sprintf ("%0.6f", $ttaken);
my $bytesrec = length($resp->content);
    if ($resp->is_success()) {
        @{$stat{'cache'}{$url}} = ($bytesrec, $ttaken);
        # official regex from http://search.cpan.org/~gaas/URI-1.58/URI.pm
        my ($scheme, $domain, $path, $query, $fragment) =
            $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
        # tell internal from external content (\Q..\E: the domain is a literal string, not a pattern)
        if ($stat{mainurl} =~ /\Q$domain\E/) {
            $stat{sintlink} += $bytesrec;
            $stat{tintlink} += $ttaken;
            $stat{cintlink}++;
        }
        else {
            $stat{sextlink} += $bytesrec;
            $stat{textlink} += $ttaken;
            $stat{cextlink}++;
        }
        if ($render) {
            (my $ele = $url) =~ s/^.*\///;
            # would be better to convert HTML escapes, as in WITH%20SPACES.gif
            open my $render_fh, '>', "$ENV{TEMP}\\_temp_files\\$ele"
                or die "unable to write to %TEMP%\\_temp_files\\$ele: $!";
            binmode $render_fh;
            print $render_fh $resp->content;
            close $render_fh;
        }
}
else {
push @{$stat{'brokenlink'}},$url;
}
}
################################################################################
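# Arrotonda_Mega ("round to mega"): turn a byte count into a human readable
# string, dividing by 1024 until the value drops below 1024.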
sub Arrotonda_Mega
{
my( $size, $n ) =( shift, 0 );
return "0 bytes" unless defined $size;
return "0 bytes" unless $size > 0;
++$n and $size /= 1024 until $size < 1024;
return sprintf "%.4f %s",
$size, ( qw[ bytes Kb Mb Gb ] )[ $n ];
}
################################################################################
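# report: print the statistics collected for the current url, with the amount
# of detail driven by --verbosity (0 = one line, up to 3 = per-resource detail).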
sub report {
    my $totsize = $stat{'sbody'} + $stat{'sintlink'} + $stat{'sextlink'};
    my $tottime = $stat{'tbody'} + $stat{'tintlink'} + $stat{'textlink'};
    $tottime = sprintf("%0.6f", $tottime);
    my $forcedtxt = defined $forced_ip ? " \@ $forced_ip\t" : "\t";
####################
    if ($verbosity == 0) {
        print scalar localtime(time), "\t",
              $stat{mainurl}, $forcedtxt, $stat{respcode}, "\t",
              $totsize, "\t",
              $tottime, "\t",
              ( $tottime ? &Arrotonda_Mega($totsize / $tottime)."/s" : 'NA' ),
              "\n";
    }
####################
    elsif ($verbosity == 1) {
        print $bsep, $stat{mainurl}, $forcedtxt, scalar localtime(time), "\n", $ssep,
              $stat{respcode}, "\t", $stat{respmess}||'UNDEF', "\t", $stat{resptitle}||'UNDEF', "\n",
              $ssep,
              "downloaded ", &Arrotonda_Mega($totsize), " ($totsize bytes) in ",
              $tottime, " seconds (",
              ( $tottime ? &Arrotonda_Mega($totsize / $tottime)."/s)" : 'NA)' ),
              "\n", $bsep;
    }
####################
    elsif ($verbosity > 1) {
        print $bsep, $stat{mainurl}, $forcedtxt, scalar localtime(time), "\n", $ssep,
              "Response code:            ", $stat{respcode}  ||'UNDEF', "\n",
              "Response message:         ", $stat{respmess}  ||'UNDEF', "\n",
              "Response server:          ", $stat{respserver}||'UNDEF', "\n",
              "Response declared length: ", $stat{resplength}||'UNDEF', "\n",
              "Response title:           ", $stat{resptitle} ||'UNDEF', "\n",
              $ssep,
              "main page content (", scalar @{$stat{pages}}, "):\t", &Arrotonda_Mega($stat{sbody}),
              " in ", $stat{tbody}, " seconds \@ ",
              ( $stat{tbody} ? &Arrotonda_Mega($stat{sbody} / $stat{tbody})."/s" : 'NA' ),
              "\n";
        if ($verbosity > 2) {
            print $ssep, "\tdetail of loaded pages (url):\n", $ssep,
                  "\t", join("\n\t", @{$stat{pages}}), "\n", $ssep;
        }
        ### report about extra downloaded content local to $url
        if ($stat{cintlink} > 0) {
            print "included content ($stat{cintlink}):\t", &Arrotonda_Mega($stat{sintlink}),
                  " in ", $stat{tintlink}, " seconds \@ ",
                  ( $stat{tintlink} ? &Arrotonda_Mega($stat{sintlink} / $stat{tintlink})."/s" : 'NA' ),
                  "\n";
        }
        else { print "no included content found.\n" }
        ### report about extra downloaded content external to $url
        if ($stat{cextlink} > 0) {
            print "external content ($stat{cextlink}):\t", &Arrotonda_Mega($stat{sextlink}),
                  " in ", $stat{textlink}, " seconds \@ ",
                  ( $stat{textlink} ? &Arrotonda_Mega($stat{sextlink} / $stat{textlink})."/s" : 'NA' ),
                  "\n";
        }
        else { print "no external content found.\n" }
        ### report about broken links
        if (scalar @{$stat{brokenlink}} > 0) {
            print "broken links found:\t", scalar @{$stat{brokenlink}}, "\n", $ssep;
            if ($verbosity > 2) {
                print "\tdetail of broken links (url):\n", $ssep,
                      (map { "\t".$_."\n" } @{$stat{brokenlink}}), $ssep;
            }
        }
        else { print "no broken links found.\n", $ssep }
        if ( ($verbosity > 2) && keys %{$stat{cache}} > 0 ) {
            print "\tdetail of loaded content (url bytes seconds):\n", $ssep,
                  (map { "\t$_ @{$stat{cache}{$_}}\n" } sort keys %{$stat{cache}}),
                  $ssep;
        }
        ## total
        print "downloaded ", &Arrotonda_Mega($totsize), " ($totsize bytes) in ",
              $tottime, " seconds (",
              ( $tottime ? &Arrotonda_Mega($totsize / $tottime)."/s)" : 'NA)' ),
              "\n", $bsep;
}
    # verbosity set to something strange: fall back to the default
else{ $verbosity = 1; &report }
    # print the received headers if requested
    if ($showhead) { print "headers received:\n", $ssep, $stat{resphead}, "\n", $bsep; }
    # 'start' opens the saved copy with the default browser (Windows)
    if ($render) { system("start $ENV{TEMP}/_temp.html"); }
}
################################################################################
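# helpme: print usage information and exit.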
sub helpme {
print <<EOH;
USAGE of $0:
--url URL [-u URL2 -u URL3 -v [0-3] -h -s n -c n -t n -p s -f IP -r]
-c --count                how many times to repeat the check. default is 1
-f --force_ip_resolution  given a valid dotted decimal IP, force the resolution of the url to it.
-h --header               print the received headers too. off by default
-p --protocol             http is assumed unless specified.
-r --render               render the page as it was parsed (giving just an idea of the original)
-s --sleep                seconds to sleep between checks. default is 1.
-t --timeout              the timeout for the connection. default is 15 seconds
-u --url                  the url or urls we want to scan
-v --verbosity            from 0 (one line of output for each check) to 3. default is 1
NOTES:
--render     also forces --count to 1 and considers the first url only. It does not replicate
             the original page exactly, because it does not take CSS and other things into account.
--verbosity
         0   one line of output containing this info: date, url, response code, bytes downloaded,
             seconds elapsed, speed.
         1   a more descriptive multiline output, with the title of the page and the bytes expressed
             in human readable form too.
         2   as 1, but with general info about external or internal loaded content and broken links.
         3   as 2, with detailed info for each loaded content.
--protocol   http is the default; you can specify ftp too. https is supported only if you have
             the Crypt::SSLeay module installed.
EXAMPLES:
-u www.perl.org -c 10 -s 5
check it 10 times sleeping 5 seconds between checks
-u www.perl.org -u search.cpan.org -c 10 -s 5
as above but checks two sites
-u www.perl.org -c 5000 -s 60 -v 0
a long running check with one line output
-u www.perl.org -h -v 3
a detailed check with header dumped
-u www.perl.org -f 127.0.0.1
try my local copy of this site..
-u 127.0.0.1 -p ftp
access the ftp running locally.
BUGS:
If your connection is too fast you may experience an 'Illegal division by zero'
error in the speed calculation.. ;)
The process of downloading content and following links will anyway differ from that
of a real browser. You cannot trust the size at byte resolution: there is no cache effect
between checks, and no css nor favicons are downloaded. You'll still get results very
close to the real figure, and speeds quite close to real-world ones.
The rendering of the page with the -r option is fairly inaccurate, but it serves to give
an overall idea of the page.
AUTHOR:
dis
cip
ulus a
t pmonks org
DISCLAIMER:
use at your own risk.
EOH
exit 0;
}
there are no rules, there are no thumbs..