# SelTran.pm        selective translation                   31jan10waw

# based on reply to PerlMonks node #820537 (perlquestion):

=comment

Greetings to all,

I asked in the chat window several days ago about how to
accomplish this, and tye provided me a good answer using map
and sort. Unfortunately, my laptop crashed shortly
thereafter, and I lost his answer. (That'll teach me, ha!)
However, there are a couple of complicating factors that tye
may not have addressed even then, and I'm looking for wisdom
on a succinct and safe way of accomplishing this.

Here's what I have:

A file containing a tab-delimited list of words to exchange
for modern spellings/equivalents, followed by a third column
for any stopwords which should not have substitutions done
in them.

A file containing a list of files in which substitutions
must be made.  Over a hundred such files needing to be
updated.

The target language is Asian, where 1) there are no spaces
between words; and 2) the encoding will be UTF-8. (This is
significant, because any regexp must be sensitive to this,
or it will fail.)

Here's an "English-ised" example of the words list file:
WORD REPLACEMENT STOPWORDS
score       twenty      fourscore,scored,scores
core        center      encore,coregent
centre      center
travelled   traveled
hasn't      has not
Johann      John        Johannesburg

So, what I need to do is substitute each word in the first
column for the word(s) in the second column, except where
the word in the stopwords column is matched. While this
seems like a simple scenario, I'm struggling to wrap my
brain around it. I'm just beginning to grasp the concepts of
map and join, and their syntax, but would much appreciate
some ideas for how to accomplish this.

Blessings,

~Polyglot~

=cut

package SelTran;  { # private package scope

use warnings
    FATAL => 'all'
    ;
use strict;

use Exporter;
our $VERSION = '0.1.0';

our @EXPORT = qw();


# syntactic sugar per mjd.
sub Iterator (&) { return $_[0]; }

# example translation table:
# my @translate = (
#   #  insert...   for...         except in...
#   [ 'TWENTY',   'score',     qw(twoscore unscored? score[srd])   ],
#   [ 'CENTER',   'core',      qw(encore[sd]? score[sd]? core[rd]) ],
#   [ 'CENTERS',  'cores',     qw(encores scores)                  ],
#   [ 'JOHN',     'Johann',    qw(Johannesburg)                    ],
#   [ 'CENTER',   'centre',    ],
#   [ 'TRAVELED', 'travelled', ],
#   [ 'HAS NOT',  'hasn\'t',   ],
#   );

sub iter {

    my $class = shift;

    my ($ar_trans_def,  # ref. to array: trans. definition table
        ) = @_;

    my %replace = map @{ $_ }[1, 0], @$ar_trans_def;

    my $search =
        join ' | ',
        map  word_regex(@{ $_ }[1 .. $#{$_}]),
        sort { $b->[1] cmp $a->[1] }  # longest words first
        @$ar_trans_def
        ;

    return Iterator {
        (my $xlt = $_[0]) =~ s{ ($search) }{$replace{$1}}xmsg;
        return $xlt;
        }

    }


sub word_regex {

    my ($word,
        @stops,
        ) = @_;

    my $not_stopped =
        join ' ',
        map  not_stopped(@$_),
        map  [ m{ \A (.*) ($word) (.*) \z }xms ],
        @stops
        ;

    return "$not_stopped $word";

    }

sub not_stopped {

    my ($stop_prefix,  # always defined if word defined, maybe empty
        $word,         # word embedded in stop word
        $stop_suffix,  # always defined if word defined, maybe empty
        ) = @_;

    return '' unless defined $word and length $word;  # need len test?

    # convert word to placeholder (faster match?)
    $word = sprintf '.{%d}', length $word;

    # convert stop prefix, if any, to POSITIVE assertion.
    $stop_prefix = "(?<= $stop_prefix)" if length $stop_prefix;

    # NEGATIVE assert of stop prefix, word placeholder, stop suffix.
    return "(?! $stop_prefix $word $stop_suffix)";

    }

}  # end SelTran private scope


1;