http://qs321.pair.com?node_id=599702
Category: Web Stuff
Author/Contact Info wizbancp
Description: A script for exploring a site and catching links. Simply specify the starting URL and the search depth (sorry for my English! :-)). At the end, the script produces a text file with the addresses it caught.
After the criticism (:-)) I modified the script to catch only link addresses and no longer email addresses.... =:-( Usage: "script.pl url depth" or simply "script.pl".
#!/usr/bin/perl -w

require LWP::UserAgent;

# --- Startup: read the crawl parameters and prepare the shared state ------
# The starting URL and the searching depth come from the command line
# ("script.pl url depth") or, when no arguments are given, from two prompts.
my ($indirizzo, $profond);

if (!@ARGV)
{
    print "Insert starting URL: ";
    $indirizzo = <STDIN>;
    print "\nInsert searching depth: ";
    $profond = <STDIN>;
}
else
{
    ($indirizzo, $profond) = @ARGV;
}

# <STDIN> returns undef at end-of-file and a one-argument call leaves the
# depth unset; treat both as empty input. Trimming also removes the newline
# that the prompts leave at the end of each answer.
for ($indirizzo, $profond)
{
    $_ = '' unless defined $_;
    s/^\s+|\s+$//g;
}

die "No starting URL given.\nUsage: $0 url depth\n" if $indirizzo eq '';

# An empty depth means "fetch only the starting page". The depth is a level
# counter, so anything other than a non-negative integer is rejected here.
$profond = 0 if $profond eq '';
die "Searching depth must be a non-negative integer, got '$profond'.\n"
    unless $profond =~ /^\d+$/;

# Accept the address with or without a scheme: "http://" is prepended only
# when the user did not already supply http:// or https://.
my $indirizzohttp = $indirizzo =~ m{^https?://}i
                  ? $indirizzo
                  : "http://" . $indirizzo;

# Work list of addresses; element 0 is the starting page.
my @elencolink = ($indirizzohttp);

# Output file that receives every captured address, one per line.
open LINK, ">", "link.txt"
    or die "Cannot open link.txt for writing: $!\n";

# Shared HTTP client used for every page fetch.
my $ua = LWP::UserAgent->new;
$ua->agent('WizCaptureBot/1.11');
$ua->timeout(10);
$ua->env_proxy;

# pausa()
# Holds the script open at the end of a run: shows a prompt and blocks
# until one line has been read from standard input. The line that was read
# is returned, although its content is of no interest.
sub pausa
{
   print "\nPress Enter to exit.\n";
   my $risposta = readline(STDIN);
   return $risposta;
}


# catturalink($codice)
# Scans the text in $codice for http:// and https:// addresses.
# Every address found is written to the LINK filehandle (one per line) and
# appended to the shared @elencolink work list, in order of appearance.
# Prints "Find N links" and returns N.
sub catturalink
{
   my $codice = shift;
   $codice = '' unless defined $codice;

   # scheme://host.domain followed by an optional path/query part.
   # The pattern is kept on a single line on purpose: a line break inside
   # the character class would become a literal newline and let a match run
   # on into the next line of the page. The last character of a match must
   # not be '.', ',' or ':' so that trailing punctuation is left out.
   my $url = qr{(?:http|https)://[\w\-]+(?:\.[\w\-]+)+(?:[\w\-.,\@?^=%&:/~+#]*[\w\-\@?^=%&/~+#])?};

   # Only the outer group captures, so list context yields whole addresses.
   my @trovati = $codice =~ /($url)/g;

   for my $indirizzolink (@trovati)
   {
      print LINK "$indirizzolink\n";
   }
   push @elencolink, @trovati;

   my $cont = scalar @trovati;
   print "Find $cont links\n";
   return $cont;
}

# visitapagina($pagina)
# Fetches the address $pagina with the shared user agent $ua and prints a
# " -- address --" header. On success the page body is handed to
# catturalink(); on failure the HTTP status line is printed instead.
# Returns nothing.
sub visitapagina
{
    my $pagina = shift;

    my $response = $ua->get($pagina);
    print "\n -- $pagina --\n";

    if (!$response->is_success)
    {
        print $response->status_line."\n";
        return;
    }

    # Undo any Content-Encoding (e.g. gzip) so the link scan sees the real
    # markup. charset => 'none' leaves the result as bytes, exactly like
    # content(). decoded_content() returns undef when the encoding is
    # unknown, in which case the raw body is scanned as before.
    my $codicehtml = $response->decoded_content(charset => 'none');
    $codicehtml = $response->content unless defined $codicehtml;

    catturalink($codicehtml);
    return;
}

# --- Main crawl: visit the work list level by level -----------------------
# @elencolink grows while pages are visited (catturalink appends to it).
# Each pass of the outer loop visits exactly the entries that were added
# during the previous pass: indexes $inizio .. $fine.
my $inizio=0;
my $fine=0;

# Addresses already fetched; the same link usually shows up many times and
# fetching it again would only repeat the same work and output.
my %visitati;

visitapagina($elencolink[0]);
$visitati{$elencolink[0]} = 1;

# "> 0" rather than "!= 0": a negative or fractional depth would never hit
# zero exactly. An unset depth counts as 0.
while(($profond || 0) > 0)
{
    $profond--;
    $inizio=$fine+1;
    $fine = $#elencolink;

    # The previous level produced no new addresses: nothing left to do.
    last if $inizio > $fine;

    for my $c ($inizio .. $fine)
    {
        my $indirizzo_corrente = $elencolink[$c];
        next if $visitati{$indirizzo_corrente}++;

        print "\n$inizio  $c  $fine";
        visitapagina($indirizzo_corrente);
    }
}

# Close the output before pausing so link.txt is complete on disk while the
# script waits for Enter; write errors on buffered output surface at close.
close(LINK) or warn "Could not close link.txt: $!\n";

print"\n Operation ended! \n";
pausa();