Beefy Boxes and Bandwidth Generously Provided by pair Networks
Do you know where your variables are?
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/local/bin/perl use warnings; use strict; use Cwd; use File::Copy; use Getopt::Long; use HTML::TokeParser::Simple 1.4; use Pod::Usage; my $VERBOSE = 1; GetOptions( 'help|?' => sub { pod2usage(-verbose => 2);exit } , 'config=s' => \my $CONFIG, 'backup=s' => \my $BACKUP, 'verbose!' => \$VERBOSE, 'quiet' => sub { $VERBOSE = 0 }, 'debug' => \my $DEBUGGING, 'ordered' => \my $ORDERED_ATTRIBUTES, 'ignore' => \my $IGNORE_ATTRIBUTES, 'text' => \my $MATCH_TEXT, 'fuzzy' => \my $FUZZY_MATCH ); backup_dir( $BACKUP ); $BACKUP .= '/' unless substr( $BACKUP, -1 ) =~ /\//; my %CONFIG = read_config( $CONFIG ); if ( $DEBUGGING ) { no warnings 'once'; require Data::Dumper; print Data::Dumper->Dump([\%CONFIG], ['*CONFIG']); } my %REPLACEMENTS; @ARGV = map { glob $_ } @ARGV; foreach my $file (@ARGV) { print "Processing ($file)...\n" if $VERBOSE; backup_file( $file, $BACKUP ); $REPLACEMENTS{$file} = 0; open HTML, '+<', $file or die "Can't open ($file) for updating: $!"; my $html = parse_document( *HTML,$file ); print "\t$REPLACEMENTS{$file} replacement(s) made to ($file)\n" if $ +VERBOSE; if ( $REPLACEMENTS{$file} ) { seek HTML, 0, 0 or die "Can't seek to start of ($file): + $!"; print HTML $html; truncate HTML, tell(HTML) or die "Can't truncate ($file): $!"; } if ( $DEBUGGING ) { print $html; } close HTML; } sub parse_document { my ($fh,$file) = @_; my $parser = HTML::TokeParser::Simple->new( $fh ); my $html = ''; while ( my $token = $parser->get_token ) { if( stacks_match( $parser, $CONFIG{stack} ) ) { $html .= $CONFIG{new}; $REPLACEMENTS{$file}++; } else { $html .= $token->as_is; } } return $html; } sub stacks_match { # if the stack matches the current token stream, return true and lea +ve the # parser at the end of the stream match. If it doesn't match, set t +he # parser to its original state and return false. my ($parser,$stack) = @_; my $stacks_match = 1; my @current_stack; for my $i ( 0 .. $#$stack ) { my $token = $parser->get_token; unless ($token) { # we've reached the end of the document and thus cannot match $parser->unget_token(@current_stack); return; } push @current_stack => $token; $stacks_match = tokens_match($token,$stack->[$i]); unless ($stacks_match) { # stacks didn't match. Restore state and return if ( $DEBUGGING ) { print "\n*** Current stack match failed:\n\n"; print Data::Dumper::Dumper(\@current_stack), "\n"; } $parser->unget_token(@current_stack); return; } } if ( $DEBUGGING ) { print "\n*** Matched this stack against config stack:\n\n"; print Data::Dumper::Dumper(\@current_stack), "\n"; } return 1; } sub munge_text { my $text = shift; $text =~ s/\W//g; return lc $text; } sub tokens_match { my ($token,$stack_token) = @_; if ( $token->[0] ne $stack_token->[0] ) { # token types did not match return; } elsif ($token->is_tag) { return token_as_string($token) eq token_as_string($stack_token); } elsif ($MATCH_TEXT) { my $curr_text = $token->return_text; my $stack_text = $stack_token->return_text; if ($FUZZY_MATCH) { $curr_text = munge_text( $curr_text ); $stack_text = munge_text( $stack_text ); } if ( $curr_text =~ /\S/ or $stack_text =~ /\S/ ) { return $curr_text eq $stack_text; } } else { # we're ignoring whatever it is, so it's an automatic match return 1; } } sub read_config { my $file = shift; my %allowed = map {$_=>1} qw(old new); open CONFIG, "<", $file or die "Cannot read ($file): $!"; my %config; local $_; my ($section,$old_section) = ('',''); while (<CONFIG>) { next unless /\S/; if ( /^\s*\[([^\]]+)\]\s*$/ ) { # [$section] $section = $1; die "Unknown section ($section) in config file ($file)" unless exists $allowed{$section}; $config{$section} = ''; next; } $config{$section} .= $_; } chomp foreach values %config; close CONFIG; return add_stack( %config ); } sub add_stack { my %config = @_; my $html = $config{old}; $config{stack} = []; my $parser = HTML::TokeParser::Simple->new( \$html ); while ( my $token = $parser->get_token ) { push @{$config{stack}} => $token; } delete $config{old}; return %config; } sub token_as_string { # so far, this is fairly simple. It merely "stringifies" the tag ty +pe # and attributes. This may change in the future. my $token = shift; my $sequence = $token->return_attrseq; return canonical_tag($token) unless $sequence && ! $IGNORE_ATTRIBUTE +S; @$sequence = sort @$sequence unless $ORDERED_ATTRIBUTES; my $attributes = $token->return_attr; my $results = ''; foreach my $attr (@$sequence) { $results .= $attr . $attributes->{$attr}; } return canonical_tag($token).$results; } sub backup_dir { my $dir = shift || die pod2usage(); unless ( -d $dir ) { mkdir $dir or die "Could not makedir ($dir): $!"; } } sub canonical_tag { # prepends a backslash my $token = shift; my $tag = $token->return_tag; $tag = "/$tag" if $token->is_end_tag and '/' ne substr $tag, 0, + 1; return $tag; } sub backup_file { my ($file,$backup) = @_; if ( -e $file ) { copy( $file, "$backup$file" ) or die "Could not copy ($file) to ($backup): $!";\ return 1; } else { warn "\tWARNING: File ($file) does not exist in (",cwd,")\n"; return; } } __END__ =head1 NAME htmlreplace -- A simple HTML replacement tool =head1 SYNOPSIS B<htmlreplace --help> for more information htmlreplace [options] [filenames] Options: --help Display POD --? Same as --help --verbose List files while processing them (default) Will also list number of substitutions made. --noverbose Turn off --verbose --quiet Same as --noverbose --config *file* Location of config file --backup *dir* The directory to back up the files to --text Match text (default is off -- only check struct +ure) --fuzzy Same as --text, but matching is more robust (se +e below) --ignore Use this to ignore attributes --ordered If specified, attributes must appear in the sam +e order in both the config html and the target ht +ml. Default is unordered. --debug This will dump the config token stack to STDOUT +. =head1 OVERVIEW This program allows a the user to create a simple configuration file t +hat will define HTML snippets and the replacement text for them. Then, a list +of file names will be iterated over, checking the HTML and if any correspondin +g HTML is found, will replace the HTML as specified in the config file =head1 DESCRIPTION =head2 Configuration File The config file takes two tokes, which should be on lines by themselve +s, C<[old]> and C<[new]>. After the C<[new]> token, add the text that yo +u wish to replace the HTML with. After the C<[old]> token, add the HTML that th +e program must search for and replace. An example configuration file named I<copyright.cfg>: [old] <!--#include virtual="includes/copyright" --> [new] <table> <tr> <td class="copyright">&copy; 2002 by Some Company, INC.</td> </tr> </table> This program parses the HTML into tokens, so whitespace is not importa +nt with tags. To use the above config file with a backup directory named I<old>: htmlreplace --config copyright.cfg --backup old *.html Command line options may also be shortened to the smallest number of l +etters necessary to distinguish them from other options. Thus, the above can + be written as follows: htmlreplace -c copyright.cfg -b old *.html =head2 Attribute handling Attributes are the name/value pairs associated with HTML start tags. +For example, the following tag has an attribute name of I<class> with a va +lue of I<foobar>. <p class="foobar"> Many tags will have multiple attributes. <img src="check.gif" height="13" width="12" alt="Check Gif"> By default, attributes of start tags do not need to appear in the same + order in the sample HTML and the HTML being examined. The following two C<i +mg> tags are equivalent: <img src="foo" height="13" width="13" alt="foo"> <img height="13" width="13" alt="foo" src="foo"> If attribute order is important, you may use the I<--ordered> switch ( +I<-o>). htmlreplace --ordered --config copyright.cgi --backup old *.html If you wish to ignore attributes and simply ensure that the order of t +he tags is correct, use the I<--ignore> (I<-i>) switch. htmlreplace --ordered --ignore --config copyright.cgi --backup old *. +html =head2 Matching Text By default, C<htmlreplace> only matches the structure of the document, + not the text. To match text, use the I<--text> (I<-t>) switch. This will ens +ure an exact match of the text (but skips any text that is pure whitespace). If the text might be a little off, such as unusual capitalization, ext +ra white space, etc., you can use the I<--fuzzy> (I<-f>) switch instead. This +will match text if the source and target texts match after all "non-word" (C</\W/>) characters are removed and all letters have been lower-cased +. =head1 COPYRIGHT Copyright (c) 2001 Curtis "Ovid" Poe. All rights reserved. This program is free software; you may redistribute it and/or modify i +t under the same terms as Perl itself.

In reply to Bulk HTML Munging by Ovid

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others exploiting the Monastery: (8)
As of 2024-04-23 07:49 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found