http://qs321.pair.com?node_id=485440
Category: HTML Utility
Author/Contact Info parv at pair dot com.
Description:

I had enough of mixed case titles in my bookmarks. Compared to upper case characters, lower case characters occupy less screen space in menus (in variable width font). Thus, the genesis of the following program.

Using HTML::TreeBuilder, following program lcs the link text. In addition, one can specify one's own code to be run on each text and URL; use -text and -url options.

Since there are some known bugs in the HTML::TreeBuilder module, running tidy on the output (file) is recommended.

UPDATE, Aug 21 2005:
-- Corrected error in code string compilation (related to symbolic references (oops!)).
-- Moved the options handling near the end.

use warnings; use strict;

use Getopt::Long
  qw( :config  gnu_compat no_ignore_case
      no_debug
    ) ;

my %opt =
 ( 'clobber' => 0
 , 'backup-suffix' => '--OLD'
 , 'text' => 'lc $_[0]'
 , 'url'  => undef
 ) ;
my @files = handle_options();

#  Compile code strings.
foreach my $c qw(text url)
{
  next unless $opt{$c};
  $opt{$c} = eval "sub { $opt{$c} }";
}

#  Use the File::Temp in non-OO mode to get both the filename & the
#  filehandle; OO mode can return only the filename.
use File::Temp qw( tempfile );
use File::Copy;
use HTML::TreeBuilder;

foreach my $f ( @files )
{
  my $message = sanity_check($f);
  if ( $message ) { warn $message; next; }

  my $tree = make_html_tree($f);
  #  Make changes.
  foreach my $node ( $tree->look_down('_tag' , 'a') )
  {
    #  Change link text.
    if ( ref $opt{'text'} && scalar $node->content_refs_list)
    {
      foreach my $text ( $node->content_refs_list  )
      {
        next if ref $$text;
        $$text = $opt{'text'}->($$text);
      }
    }

    #  Change link URL.
    my $url = $node->attr('href');
    $node->attr( 'href' , $opt{'url'}->($url) )
      if ref $opt{'url'} && $url ;
  }

  #  Save changes.
  my ($fh , $new_name) = open_new_file($f);
  print $fh $tree->as_HTML;
  $tree = $tree->delete;

  close $fh or die "Could not close $new_name\n";
}

exit;

#  Given a file name, returns the  HTML::TreeBuilder object.
sub make_html_tree
{
  my ($file) = @_;
  my $tree = HTML::TreeBuilder->new;
  $tree->warn(1);
  #  Configure parsing.
  $tree->no_space_compacting(1);
  $tree->ignore_ignorable_whitespace(0);
  $tree->ignore_unknown(0);
  $tree->store_comments(1);
  $tree->store_declarations(1);
  $tree->store_pis(1);
  #
  $tree->parse_file($file);
  $tree->eof;
  $tree->elementify;
  return $tree;
}

#  Given a file name, checks if it is plain readable file and if clobb
+ering
#  specified if it is writable too.  Returns undef if sanity is ok.
sub sanity_check
{
  my ($f) = @_;
  stat $f;
  return
    !( -f _ && -r _ )
    ? "'$f' skipped, either it is not a plain file or is unreadable\n"
    : $opt{'clobber'} && ! -w _
      ? "'$f' skipped, clobbering specified but file is not writable\n
+"
      : undef;
}

#  Returns file handle (to a new file) and optionally a file name.
sub open_new_file
{
  my ($old) = @_;
  my @open = $opt{'clobber'} ? tempfile() : backup_and_open($old);

  #  Return file handle in scalar context.
  return wantarray ? @open : $open[0];
}

#  Copies the given file to backup; opens the copy and returns the fil
+e handle
#  and the given file name (to be consistent w/ File::Temp::tempfile()
+).
{
  my $FH;
  sub backup_and_open
  {
    my ($old) = @_;
    my $backup = $old . $opt{'backup-suffix'};
    copy $old , $backup or die "Could not make backup: $!";

    open $FH, '>'  , $old or die "Could not open '$old' to write: $!";
    return ($FH , $old);
  }
}

#  Handle options.
sub handle_options
{
  GetOptions
  ( 'help|usage' => \$opt{'usage'}
  , 'no-clobber' => \$opt{'clobber'}
  , 'clobber'    => \$opt{'clobber'}
  , 'i|suffix'   => \$opt{'backup-suffix'}
  , 'text=s'     => \$opt{'text'}
  , 'url=s'      => \$opt{'url'}
  )
    or die 'Wrong option(s) given' , options_doc();
  if ( $opt{'usage'} )
  {
    print "$0 [options] file [file file ...]\n" , options_doc();
    exit 0;
  }
  die "Give at least one HTML file to work with.\n"
    unless scalar @ARGV;

  #  Files to work on.
  return @ARGV;
}

#  Document the options.
sub options_doc
{
  return <<_USAGE_;
 Options are ...

    -help       Show this message.

  File Output:

    -no-clobber Save the original files as a backups. (Default)
    -clobber    Overwrite the given files.  This overrides the
                -no-clobber option.
    -suffix     Text to append at the end of backup. (Default is
                '--OLD'.)

  Code (given as a string which is run inside a sub{}; last value
  returned from the sub is used as then new value for text or URL):

    -text       Code to run over text of each link.  (Default is to
                lowercase the text.)
    -url        Code to run over url of each link.  (Default is
                nothing to do.)
_USAGE_
}