A hobby of mine is translating Jack Vance into Esperanto. And yes, I have permission for this! These I distribute for free in EPub format. In each ebook I like to embed a mini linked-in dictionary to help out beginners.
I was wanting to re-organize some standalone EPubs into one omnibus EPub. I wanted one end-of-book dictionary instead of six end-of-chapter ones. That meant re-alphabetizing hundreds of anchor links. No big deal to do it in Perl...except that it's Unicode...and Esperanto. Here's how I did it.
#!/usr/bin/perl
# Program reads in any and all files named "Vortaro_.*" from the curre
+nt directory.
# It assumes each file to contain one or more jumbled dictionaries in
+Esperanto from
# some EPub at Eldonejo Mistera Sturno. For each file read in, a new f
+ile "new_Vortaro_.*\.txt"
# is written, no longer jumbled but fully alphabetized. It may, howeve
+r, have some duplicates.
# This is because the word was defined plurally in the jumble. A list
+of these duplicates will
# be listed in a file "keys_plural.txt" as a guide to manual sorting o
+ut.
use utf8;
use open qw(:std :utf8);
use Cwd;
###################
# BEGIN USER DEFS #
###################
# Where things are.
my $dir_input = './';
my $dir_output = './';
# Regular Expression variables
my $regex_0 = qr(Vortaro_.*); # Input files to parse
my $regex_1 = qr(<p class="left"><a id="[A-Z]_[0-9]+_" href="5.2-[0-9]
+{2}.xhtml#_[0-9]+_"><b>); # Left boundary of definition word.
my $regex_2 = qr(<.*); # Right boundary of definition word.
#################
# END USER DEFS #
#################
my @file_list = ();
# The array of all characters in Esperanto which are to factor in alph
+abetic sorting.
my @zam = qw( / A B C Ĉ D E F G Ĝ H Ĥ I J Ĵ K L M
+N O P R S Ŝ T U Ŭ V Z a á b c ĉ d e é f g ĝ h 
+93; i ï j ĵ k l m n o ó p r s ŝ t u ŭ ú v z );
# An equal-or-larger array corresponding to the above but in ASCII sor
+ting order.
# Duplicates exist to make the sorting not case sensitive.
my @abc = qw( / 0 2 3 4 5 6 8 9 A B C D F G H I J K L N O P Q R S U V
+W 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X
+ Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z );
my %key_ct = {};
# Find that char in @abc holding same position as a given Unicode char
+ present in @zam.
sub zam_abc {
my ($a) = @_;
my $i = 0;
for my $z ( @zam ) {
last if $a eq $z;
++$i;
}
return $abc[$i];
}
# Translate an Eo word into sortable gibberish that works with Perl.
sub sortable_key {
my $a = shift;
my $z = '';
for my $x (split //, $a) {
$z .= zam_abc($x);
}
if ( exists $key_ct{$a} ) {
$key_ct{$a} += 1; # Inc tally.
} else {
$key_ct{$a} = 0; # Init new tally.
};
return $z . sprintf("_%02d", $key_ct{$a});
}
# Print to file a list of overlapping, plural keys.
sub list_plural_keys {
open my $fh_out_2, '>', "$dir_output/keys_plural.txt" or die $!;
for my $key (@keys) {
if ($key_ct{$key} > 0) {
print $fh_out_2 $key . " = $key_ct{$key}\n";
}
}
}
# Parse out input vortaro for definitions.
sub parse_file {
my ($fh_in, $fh_out_1) = @_;
my %def_links;
my @def_keys;
while (<$fh_in>)
{
next unless $_ =~ m/b/;
my $def_line = my $def_word = $_;
my $def_word = get_word($def_line);
my $def_key = sortable_key($def_word);
$def_links{$def_key} = $def_line;
push @def_keys, $def_key;
}
@def_keys = sort @def_keys;
+
for (@def_keys) {
print $fh_out_1 "\n" . $def_links{$_};
}
}
# Extract the defined word for use as a key.
sub get_word {
my $str = shift;
# $str =~ s/<p class="left"><a id="._[0-9]+_" href="1-.\xhtml#_[0-
+9]+_"><b>//;
$str =~ s/$regex_1//;
$str =~ s/$regex_2//;
return $str;
}
chdir cwd();
# Find and process all vortaro files.
opendir $dh, $dir_input or die "Oops! Cannot open $dir_input directory
+.\n";
my @dir_list = readdir $dh;
closedir $dh;
for (@dir_list) {
next unless $_ =~ /$regex_0\.txt$/;
next if $_ =~ /^new_/;
push @file_list, $_;
}
# Merge plural vortaroj into single, interleaved vortaro.
# Merged vortaro may contain duplicate entries.
# Gives second output file listing those duplicate entries.
for my $file_in (@file_list) {
my ($fh_in, $fh_out_1);
if (open $fh_in, '<:encoding(UTF-8)', $file_in) {
my $file_out = 'new_' . $file_in;
if (open $fh_out_1, '>', $file_out) {
} else {
print "Oops! Can't write to '$file_out'.\n";
}
print "Busy parsing '$file_in' ... \n";
parse_file($fh_in, $fh_out_1);
print "Output = '$file_out' \n\n";
close $fh_in;
close $fh_out_1;
} else {
print "Oops! Can't read from '$file_in'.\n";
}
list_plural_keys($fh_out_2);
print "All done.\n";
}
__END__
RegEx Puzzle Area
Mazirien la Magiisto
<p class="left"><a id="F_0540_" href="1-6.xhtml#_0540_"><b>skarlat/o</
+b></a> Brilega sangoruĝa koloro.</p>
<p class="left"><a id="._[0-9]+_" href="1-.\xhtml#_[0-9]+_"><b>
Domo de'l Se
<p class="left"><a id="A_0530_" href="5.2-01.xhtml#_0530_"><b>delekt/i
+</b></a> (tr) Tre plezurigi.</p>
<p class="left"><a id="[A-Z]_[0-9]+_" href="5.2-[0-9].xhtml#[0-9]+_"><
+b>