Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

HTML Href attribute content replacer

by Genius (Beadle)
on Jun 06, 2001 at 01:37 UTC ( [id://86027]=perlcraft: print w/replies, xml ) Need Help??

   1: #!/usr/bin/perl -w
   2: 
   3: ######################################################################
   4: #
   5: #   Parse-href.pl      2001-05
   6: #
   7: #   Grab the content of all "href" attributes of the HTML "<a>" tag,
   8: #   Insert a redirection URL in the "href" and then UrlEncode the Old 
   9: #   Url to pass it in parameter.
  10: # 
  11: #   Greetings flys out to OeufMayo for his help.
  12: #
  13: #   Nicolas Crovatti  <ncrovatti@ifrance.com>
  14: #   http://www.gencoding.com
  15: #
  16: ######################################################################
  17: 
  18: 
  19: use strict;
  20: use URI::Escape;
  21: use warnings;
  22: use CGI ':standard';
  23: 
  24: my @forminputs;
  25: my $JavascriptLinks;
  26: my %results;
  27: my $name;
  28: my $html;
  29: my $OUTHTML;
  30: my $INHTML;
  31: my $dDate     = time();
  32: my $RedirURL  = "http://127.0.0.1/rapport/rnews.php?dt=" . $dDate . "&url=";
  33: my $File      = param("file");
  34: 
  35: {
  36:   package myParser;
  37:   use base qw(HTML::Parser);
  38:     sub start
  39:     {
  40:       # We fill Scalars using predefined array "@_"
  41:       ####################################################################
  42:       my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
  43:       my $at;
  44:       # For each <a> tag, we  grab the content of the "href" attribute, then we
  45:       # uri_escape it with URI::Escape module, and we rebuild the integrality 
  46:       # of the tag including our $RedirURL .
  47:       ##########################################################################
  48:       if ($tagname eq 'a'){
  49:         #Here, if we reach a <a> tag,i don't want to parse '<a href="#">' 
  50:         #links, we incremment the $LiensJavascript counter.
  51:         ###################################################################
  52:         if ($attr->{href} && $attr->{href} eq "#") {
  53:           $JavascriptLinks++;
  54:           print $origtext;
  55:         } else {
  56:             $attr->{href} =  $RedirURL . main::uri_escape($attr->{href}, "^A-Za-z0-9");
  57:             print '<a';
  58:             print qq' $_="$attr->{$_}"' foreach @{$attrseq};
  59:             print ">";
  60:         }
  61:       } else {
  62:         # If we don't find <a> tag, we print the original text
  63:         ######################################################
  64:         print $origtext;
  65:       }
  66:     }
  67:     sub end {
  68:       my ($self, undef, $origtext) = @_;
  69:       print $origtext;
  70:     }
  71:     sub text
  72:     {
  73:       my ($self, $origtext) = @_;
  74:       print $origtext;
  75:     }
  76: }
  77: 
  78: # Getting all params
  79: ####################
  80: @forminputs=param();
  81: foreach $name (@forminputs){
  82:   $results{$name}=param($name);
  83: }
  84: 
  85: # Opening the HTML file in read mode only
  86: ############################################
  87: open(INHTML, "<$File") || die <<"EOT_";
  88:  -[Erreur prevue]---------------------
  89:  Usage :
  90:  perl parse.pl file=file_name.htm
  91:  $!
  92:  -------------------------------------
  93: EOT_
  94: 
  95: 
  96: open(OUTHTML, ">News-ok.htm") || die <<"EOT_";
  97:  -[Erreur]----------------------------
  98:  Cannot open file:
  99:  $!
 100:  -------------------------------------
 101: EOT_
 102: 
 103: # $html is filled with all the content of the input file
 104: ########################################################
 105: while ($_=<INHTML>){$html .= $_;}
 106: close INHTML;
 107: 
 108: 
 109: # Initialising the Parser
 110: #########################
 111: my $p = myParser->new();
 112: 
 113: # Parsing $html
 114: ###############
 115: select OUTHTML;
 116: $p->parse($html);
 117: select STDOUT;
 118: 
 119: close OUTHTML;
 120: 
 121: print "
 122:  -[Result ok]---------------
 123: 
 124:  Everything goes Ok!
 125:  
 126:  not modified links : $JavascriptLinks
 127:  
 128:  -[eot]-----------------------------
 129: ";
 130: exit;

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others wandering the Monastery: (5)
As of 2024-04-25 17:09 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found