Re: Getting the contents of any 'url'

I am not sure what you mean by 'bad links', whether you mean incorrect html or links with a 404 on the other side. if you are looking for 404's, you could try this.. it only checks the links directly off of the root page ($url)

#!/usr/bin/perl -w
use LWP::Simple;
use HTML::TokeParser;
use strict;

my $url="http://www.perlmonks.org";
my $content = get("$url");

my $parse = HTML::TokeParser->new(\$content);
my $testlink;
while (my $token = $parse->get_tag("a")) {
      # put the link and the text into variables
   my $link = $token->[1]{href} || "-";
   my $text=$parse->get_trimmed_text("/a");

      # if $link is fully qualified url
   if ($link=~ m/http\:\/\//i){
         # use LWP to get the link
      $testlink=get("$link");
      if ($testlink){
            # parse the title returned from the testlink for 404 or 
            # not found errs
         my $testparse=HTML::TokeParser->new(\$testlink);
         if ($testparse->get_tag("title")) {
            my $title = $testparse->get_trimmed_text;
            if (($title=~ m/not found/i) || ($title=~ m/404/)) {
               print "* $link ($text) is a bad link\n";
            } else {
               print "$link ($text) seems to be a good link\n";
            }
         }
      }
   } else {
         # guess at qualifiny url by adding $url to the front..
      $testlink=get("$url/$link");
      if ($testlink) {
         my $testparse=HTML::TokeParser->new(\$testlink);
         if ($testparse->get_tag("title")) {
            my $title = $testparse->get_trimmed_text;
            if (($title=~ m/not found/i) || ($title=~ m/404/)) {
               print "* $url/$link ($text) is a bad link\n";
            } else {
               print "$url/$link ($text) seems to be a good link\n";
            }
         }
      }
   }
}
exit;
[download]

you may also want to check out How to ask questions the smart way..
-p

Comment on Re: Getting the contents of any 'url' Download Code

Replies are listed 'Best First'.

Re: Re: Getting the contents of any 'url'
by tachyon (Chancellor) on Oct 04, 2001 at 02:29 UTC

Hi this is not the best way to do this. Some significant problems with your script are that you load the contern of the entire link and then parse this when you only want the header which will contain the HTTP response code in the status line. You might find this a bit better, it is much faster.

#!/usr/bin/perl -w
use strict;
use HTML::TokeParser;
use LWP::UserAgent;

my $ua      = LWP::UserAgent->new;
my $proxy   = 'http://proxy.hbt.tassie.net.au:8080';
my $netloc  = '';
my $realm   = '';
my $uname   = '';
my $pass    = '';
$ua->credentials($netloc, $realm, $uname, $pass);
$ua->proxy( 'http', $proxy );

my $url = 'http://www.perlmonks.com/';
$url .= '/' if $url =~ m|http://[^/]+$|i;  # add trailing / if forgott
+en
my ($root) = $url =~ m|(http://.*/)|i;
my $content = get( $url );

print "Checking links....\n";
my $parse = HTML::TokeParser->new( \$content );
while (my $token = $parse->get_tag('a')) {
   my $link = $token->[1]{href} || next;
   my $text=$parse->get_trimmed_text('/a');
   if ($link =~ m|^\s*mailto|i) {
      next;
   }
   elsif ($link=~ m|^\s*http://|i){
      print &test( $link, $text );
   } 
   else {
      $link =~ s|^\s*/||;
      print &test( $root.$link, $text );
   }
}

sub get {
    my $url = shift;
    print "Getting $url....";
    my $request = HTTP::Request->new( 'GET', $url );
    my $content = $ua->request( $request ); 
    print "$$content{_msg} $$content{_rc}\n";
  return $$content{_content};
}

sub test {
    my ( $url, $text ) = @_;
    my $request = HTTP::Request->new( 'HEAD', $url );
    my $content = $ua->request( $request );
  return  "$$content{_msg} $$content{_rc} ($text) $url\n";
}

__END__
Getting http://www.perlmonks.com/....OK 200
Checking links....
OK 200 (Frank) http://www.perlmonks.com/index.pl?node_id=966&lastnode_
+id=131
OK 200 (perlmonks) http://www.perlmonks.com/index.pl
OK 200 (login) http://www.perlmonks.com/index.pl?node=login&lastnode_i
+d=131
[snip]
[download]

cheers

tachyon

s&&rsenoyhcatreve&&&s&n.+t&"$'$`$\"$\&"&ee&&y&srve&&d&&print

[reply]
[d/l]


Clear questions and runnable code get the best and fastest answer
	PerlMonks