#!/usr/bin/perl

use strict;
use Pod::Usage;
use Getopt::Long;
use Encode;

my ( $ityp, $otyp ) = ( qw/u8 pe/ );
my $man = my $help = my $vctl = 0;
my $okargs = GetOptions( 'help|?' => \$help, man => \$man, c => \$vctl,
                         'i=s' => \$ityp, 'o=s' => \$otyp );

pod2usage(-exitstatus => 0, -verbose => 2) if $man;
pod2usage(1) if ( $help or $ityp !~ /^u[8fbln]/ or $otyp !~ /^u[8cfbln]|[dhpy]e$/ );
pod2usage(2) if ( ! $okargs or ( @ARGV == 0 and -t ));

my $native = (pack("S",1) eq pack("v",1)) ? 'UTF-16LE' : 'UTF-16BE';
my %mode = ( ub => 'UTF-16BE',
             ul => 'UTF-16LE',
             un => $native,
            );
my %format = ( de => '&#%d;',
               he => '&#x%04X;',
               pe => '\x{%04x}',
               ye => '\u%04x',
               uc => 'U+%04X',
             );
my $replset = ( $vctl ) ? '[^\x09\x0a\x20-\x7e]' : '[^\x00-\x7f]';
my $replsub;
$replsub = sub { local($_) = shift; s/($replset)/sprintf($format{$otyp},ord($1))/ge; $_ }
    if ( $otyp =~ /.[ec]/ );

my $imode = ( $ityp =~ /u[bln]/ ) ? $mode{$ityp} : 'utf8';
my $omode = ( $otyp =~ /u[bln]/ ) ? ":encoding($mode{$otyp})" : ':utf8';
binmode STDOUT, $omode;
my $buffer;
my %names;
if ( $otyp =~ /uf/ ) {
    for ( split /^/, do 'unicore/Name.pl' ) {
        my @f = split( /\t/ );
        if ( $f[1] eq '' ) {
            $names{$f[0]} = $f[2];
        }
        else {
            $names{range}{$f[2]} = [ $f[0], $f[1] ];
        }
    }
}

while (<>) {
    $_ = decode( $imode, $_ );
    if ( $ityp =~ /uf/ ) {
        next unless ( /^([\da-f]{1,5})\s/i );
        my $cp = chr(hex($1));
        $buffer .= $cp;
        next unless ( $cp =~ /\n/ );
    }
    elsif ( $imode eq 'utf8' ) {   # check for ascii-fied encodings:
        s/\&#(\d+);/chr($1)/ge;                 # decimal char. entity
        s/\&#x([\da-f]+);/chr(hex($1))/ige;     # hex char. entity
        s/\\u([\da-f]{4,5})/chr(hex($1))/ige;   # Python hex notation
        s/\\x\{([\da-f]+)\}/chr(hex($1))/ige;   # Perl hex notation
        s/U\+([\dA-Fa-f]{4,5})/chr(hex($1))/ge; # unicode.org notation
    }

    if ( length( $buffer )) {
        $_ = $buffer;
        $buffer = '';
    }
    elsif ( defined( $replsub)) {
        $_ = &$replsub( $_ ) ;
    }
    if ( $otyp !~ /uf/ ) {
        print;
    }
    else {
        for my $c ( split // ) {
            my $h = sprintf( "%04X", ord( $c ));
            my $name = $names{$h} || get_range( $h ) || "undefined codepoint\n";

            if ( $otyp eq 'uf' ) {
                $c =~ s/([\x00-\x1f\x7f])/sprintf("^%s",chr(ord($1)+64))/e;
                print "$h\t$c\t$name";
            } else {
                print "$h\t$name";
            }
        }
    }
}

sub get_range
{
    my $h = shift;
    for ( keys %{$names{range}} ) {
        if ( $h ge $names{range}{$_}[0] and
             $h le $names{range}{$_}[1] ) {
            return $_;
        }
    }
    return;
}

=head1 NAME

tlu -- transliterate unicode

=head1 SYNOPSIS

tlu [-i itype] [-o otype] [-c] [-help|-man] [ file ... ]

=head1 OPTIONS

 itype may be one of:
   u8 : variable-width utf8 characters (default, works for ASCII)
   ul : fixed-width UTF-16LE characters (low  byte first)
   ub : fixed-width UTF-16BE characters (high byte first)
   un : fixed-width UTF-16 characters (native byte order)
   uf : hex codepoint, utf8 char and full name (one char per line)

 otype may be one of:
   pe : Perl escape-character notation: \x{hhhh} (default)
   ye : Python escape-character notation: \uhhhh
   de : html decimal character entities: &#dddd;
   he : hexadecimal character entities: &#xhhhh;
   uc : Unicode codepoint notation:      U+hhhh
   u8, ul, ub, un, uf : (same as for itype)
   ufx : same as "uf", but without utf8 char

 -c : apply output notation to ASCII control characters

 -help : show this synopsis
 -man  : show full documention

=head1 DESCRIPTION

Input may be from STDIN or from one or more files named as args on the
command line.  Output is always to STDOUT.  All input and output is
treated as utf8, except when the input and/or output type is set to
"ul", "ub" or "un", in which case the input(s) and/or STDOUT will be
set to "UTF-16LE", "UTF-16BE", or the native byte order, respectively.

This is a filter whose basic purpose is to put all non-ASCII Unicode
content into a consistent format of the user's choice.  The handling
of input is as flexible and forgiving as possible.  If you indicate
specifically that the input should be treated as UTF-16 (either BE or
LE), then we do need to be strict about that, but in the default case
("u8" for utf8/ASCII input), all of the following are accepted, in any
combination (inputs can be heterogeneous):

=over 4

=item * actual utf8 wide characters

=item * html decimal numeric character entity references: &#1234;

=item * html hexadecimal character entity references: &#x12ab;

=item * Perl hexadecimal character escapes: \x{12ab}

=item * Python hexadecimal character escapes: \u12ab

=item * Unicode hexidecimal codepoint labels: U+12ab

=back

If you select "ul", "ub" or "un" as the output format, all characters
will be converted to 16-bit values (each ASCII character will have a
null high byte appended); selecting "u8" will produce proper utf8
output (characters in the ASCII range remain single-byte, non-ASCII
characters are 2 to 4 bytes each).

The "uf" format, for both input and output, uses a special treatment
in which each character of data is on a sepearate line.  By default,
each line is formatted as follows:

  hhhh (tab) utf8_character (tab) FULL UNICODE CHARACTER NAME

The "utf8_character" field uses caret-notation control codes for the
"invisible" ASCII control codes, such as "^J" for line-feed, "^@" for
null, etc.  (ASCII "DELETE" -- 0x7f -- appears as "^" followed by an
upside-down question mark.)

Use "ufx" as the output format to exclude the utf8_character field
from each line (print only the codepoint value and full character
name, separated by tab).  When using "uf" style as input, all that
really matters is that the first token on each line be a valid
hexadecimal codepoint value (that is, you don't need to worry about
"uf" vs. "ufx" on input).

All the other output options (de, he, pe, ye, uc) are guaranteed to
generate ASCII-only text data, such that each non-ASCII Unicode
character is represented by its corresponding (decimal or hex) numeric
form with suitable punctuation.  Conversion of wide characters to the
chosen notation is the only modification applied to the data (that is,
nothing else is added to the data, such as extra spacing to make it
easier to read).

The "-c" option is handy if your data includes any of the "invisble"
ASCII control characters; it will cause these to be treated just like
unicode wide characters: they will be made explicitly visible via the
chosen output format (unless you've chosen "u8" for utf8 output).  For
"uf" style output, of course, these things would be visible in any
case, because each character gets its own output line.

=head1 AUTHOR

David Graff

=cut