#!/usr/local/bin/perl
use warnings;
use strict;
use Cwd;
use File::Copy;
use Getopt::Long;
use HTML::TokeParser::Simple 1.4;
use Pod::Usage;
my $VERBOSE = 1;
GetOptions(
'help|?' => sub { pod2usage(-verbose => 2);exit } ,
'config=s' => \my $CONFIG,
'backup=s' => \my $BACKUP,
'verbose!' => \$VERBOSE,
'quiet' => sub { $VERBOSE = 0 },
'debug' => \my $DEBUGGING,
'ordered' => \my $ORDERED_ATTRIBUTES,
'ignore' => \my $IGNORE_ATTRIBUTES,
'text' => \my $MATCH_TEXT,
'fuzzy' => \my $FUZZY_MATCH
);
backup_dir( $BACKUP );
$BACKUP .= '/' unless substr( $BACKUP, -1 ) =~ /\//;
my %CONFIG = read_config( $CONFIG );
if ( $DEBUGGING ) {
no warnings 'once';
require Data::Dumper;
print Data::Dumper->Dump([\%CONFIG], ['*CONFIG']);
}
my %REPLACEMENTS;
@ARGV = map { glob $_ } @ARGV;
foreach my $file (@ARGV) {
print "Processing ($file)...\n" if $VERBOSE;
backup_file( $file, $BACKUP );
$REPLACEMENTS{$file} = 0;
open HTML, '+<', $file or die "Can't open ($file) for updating: $!";
my $html = parse_document( *HTML,$file );
print "\t$REPLACEMENTS{$file} replacement(s) made to ($file)\n" if $
+VERBOSE;
if ( $REPLACEMENTS{$file} ) {
seek HTML, 0, 0 or die "Can't seek to start of ($file):
+ $!";
print HTML $html;
truncate HTML, tell(HTML) or die "Can't truncate ($file): $!";
}
if ( $DEBUGGING ) {
print $html;
}
close HTML;
}
sub parse_document {
my ($fh,$file) = @_;
my $parser = HTML::TokeParser::Simple->new( $fh );
my $html = '';
while ( my $token = $parser->get_token ) {
if( stacks_match( $parser, $CONFIG{stack} ) ) {
$html .= $CONFIG{new};
$REPLACEMENTS{$file}++;
}
else {
$html .= $token->as_is;
}
}
return $html;
}
sub stacks_match {
# if the stack matches the current token stream, return true and lea
+ve the
# parser at the end of the stream match. If it doesn't match, set t
+he
# parser to its original state and return false.
my ($parser,$stack) = @_;
my $stacks_match = 1;
my @current_stack;
for my $i ( 0 .. $#$stack ) {
my $token = $parser->get_token;
unless ($token) {
# we've reached the end of the document and thus cannot match
$parser->unget_token(@current_stack);
return;
}
push @current_stack => $token;
$stacks_match = tokens_match($token,$stack->[$i]);
unless ($stacks_match) {
# stacks didn't match. Restore state and return
if ( $DEBUGGING ) {
print "\n*** Current stack match failed:\n\n";
print Data::Dumper::Dumper(\@current_stack), "\n";
}
$parser->unget_token(@current_stack);
return;
}
}
if ( $DEBUGGING ) {
print "\n*** Matched this stack against config stack:\n\n";
print Data::Dumper::Dumper(\@current_stack), "\n";
}
return 1;
}
sub munge_text {
my $text = shift;
$text =~ s/\W//g;
return lc $text;
}
sub tokens_match {
my ($token,$stack_token) = @_;
if ( $token->[0] ne $stack_token->[0] ) {
# token types did not match
return;
}
elsif ($token->is_tag) {
return token_as_string($token) eq token_as_string($stack_token);
}
elsif ($MATCH_TEXT) {
my $curr_text = $token->return_text;
my $stack_text = $stack_token->return_text;
if ($FUZZY_MATCH) {
$curr_text = munge_text( $curr_text );
$stack_text = munge_text( $stack_text );
}
if ( $curr_text =~ /\S/ or $stack_text =~ /\S/ ) {
return $curr_text eq $stack_text;
}
}
else {
# we're ignoring whatever it is, so it's an automatic match
return 1;
}
}
sub read_config {
my $file = shift;
my %allowed = map {$_=>1} qw(old new);
open CONFIG, "<", $file or die "Cannot read ($file): $!";
my %config;
local $_;
my ($section,$old_section) = ('','');
while (<CONFIG>) {
next unless /\S/;
if ( /^\s*\[([^\]]+)\]\s*$/ ) { # [$section]
$section = $1;
die "Unknown section ($section) in config file ($file)"
unless exists $allowed{$section};
$config{$section} = '';
next;
}
$config{$section} .= $_;
}
chomp foreach values %config;
close CONFIG;
return add_stack( %config );
}
sub add_stack {
my %config = @_;
my $html = $config{old};
$config{stack} = [];
my $parser = HTML::TokeParser::Simple->new( \$html );
while ( my $token = $parser->get_token ) {
push @{$config{stack}} => $token;
}
delete $config{old};
return %config;
}
sub token_as_string {
# so far, this is fairly simple. It merely "stringifies" the tag ty
+pe
# and attributes. This may change in the future.
my $token = shift;
my $sequence = $token->return_attrseq;
return canonical_tag($token) unless $sequence && ! $IGNORE_ATTRIBUTE
+S;
@$sequence = sort @$sequence unless $ORDERED_ATTRIBUTES;
my $attributes = $token->return_attr;
my $results = '';
foreach my $attr (@$sequence) {
$results .= $attr . $attributes->{$attr};
}
return canonical_tag($token).$results;
}
sub backup_dir {
my $dir = shift || die pod2usage();
unless ( -d $dir ) {
mkdir $dir or die "Could not makedir ($dir): $!";
}
}
sub canonical_tag {
# prepends a backslash
my $token = shift;
my $tag = $token->return_tag;
$tag = "/$tag" if $token->is_end_tag and '/' ne substr $tag, 0,
+ 1;
return $tag;
}
sub backup_file {
my ($file,$backup) = @_;
if ( -e $file ) {
copy( $file, "$backup$file" )
or die "Could not copy ($file) to ($backup): $!";\
return 1;
}
else {
warn "\tWARNING: File ($file) does not exist in (",cwd,")\n";
return;
}
}
__END__
=head1 NAME
htmlreplace -- A simple HTML replacement tool
=head1 SYNOPSIS
B<htmlreplace --help> for more information
htmlreplace [options] [filenames]
Options:
--help Display POD
--? Same as --help
--verbose List files while processing them (default)
Will also list number of substitutions made.
--noverbose Turn off --verbose
--quiet Same as --noverbose
--config *file* Location of config file
--backup *dir* The directory to back up the files to
--text Match text (default is off -- only check struct
+ure)
--fuzzy Same as --text, but matching is more robust (se
+e below)
--ignore Use this to ignore attributes
--ordered If specified, attributes must appear in the sam
+e
order in both the config html and the target ht
+ml.
Default is unordered.
--debug This will dump the config token stack to STDOUT
+.
=head1 OVERVIEW
This program allows a the user to create a simple configuration file t
+hat will
define HTML snippets and the replacement text for them. Then, a list
+of file
names will be iterated over, checking the HTML and if any correspondin
+g HTML
is found, will replace the HTML as specified in the config file
=head1 DESCRIPTION
=head2 Configuration File
The config file takes two tokes, which should be on lines by themselve
+s,
C<[old]> and C<[new]>. After the C<[new]> token, add the text that yo
+u wish to
replace the HTML with. After the C<[old]> token, add the HTML that th
+e program
must search for and replace.
An example configuration file named I<copyright.cfg>:
[old]
<!--#include virtual="includes/copyright" -->
[new]
<table>
<tr>
<td class="copyright">© 2002 by Some Company, INC.</td>
</tr>
</table>
This program parses the HTML into tokens, so whitespace is not importa
+nt with
tags.
To use the above config file with a backup directory named I<old>:
htmlreplace --config copyright.cfg --backup old *.html
Command line options may also be shortened to the smallest number of l
+etters
necessary to distinguish them from other options. Thus, the above can
+ be
written as follows:
htmlreplace -c copyright.cfg -b old *.html
=head2 Attribute handling
Attributes are the name/value pairs associated with HTML start tags.
+For
example, the following tag has an attribute name of I<class> with a va
+lue of
I<foobar>.
<p class="foobar">
Many tags will have multiple attributes.
<img src="check.gif" height="13" width="12" alt="Check Gif">
By default, attributes of start tags do not need to appear in the same
+ order
in the sample HTML and the HTML being examined. The following two C<i
+mg> tags
are equivalent:
<img src="foo" height="13" width="13" alt="foo">
<img height="13" width="13" alt="foo" src="foo">
If attribute order is important, you may use the I<--ordered> switch (
+I<-o>).
htmlreplace --ordered --config copyright.cgi --backup old *.html
If you wish to ignore attributes and simply ensure that the order of t
+he tags
is correct, use the I<--ignore> (I<-i>) switch.
htmlreplace --ordered --ignore --config copyright.cgi --backup old *.
+html
=head2 Matching Text
By default, C<htmlreplace> only matches the structure of the document,
+ not the
text. To match text, use the I<--text> (I<-t>) switch. This will ens
+ure an
exact match of the text (but skips any text that is pure whitespace).
If the text might be a little off, such as unusual capitalization, ext
+ra white
space, etc., you can use the I<--fuzzy> (I<-f>) switch instead. This
+will
match text if the source and target texts match after all "non-word"
(C</\W/>) characters are removed and all letters have been lower-cased
+.
=head1 COPYRIGHT
Copyright (c) 2001 Curtis "Ovid" Poe. All rights reserved.
This program is free software; you may redistribute it and/or modify i
+t under
the same terms as Perl itself.
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.