#!/usr/bin/perl

# tlu -- transliterate Unicode text between raw encodings (utf8, UTF-16)
# and ASCII-safe notations (Perl/Python escapes, HTML entities, U+hhhh),
# or list one character per line with its name ("uf"/"ufx").
# See the POD at the end of this file for the user-level manual.

use strict;
use warnings;
use Pod::Usage;
use Getopt::Long;
use Encode;

# ---------------------------------------------------------------------
# Command-line handling
# ---------------------------------------------------------------------

my ( $ityp, $otyp ) = qw/u8 pe/;    # default: utf8 in, Perl escapes out
my ( $man, $help, $vctl ) = ( 0, 0, 0 );

my $okargs = GetOptions(
    'help|?' => \$help,
    man      => \$man,
    c        => \$vctl,
    'i=s'    => \$ityp,
    'o=s'    => \$otyp,
);

pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;

# Both patterns are anchored and grouped so that only the documented type
# names pass; an unknown name would otherwise lead to an undefined
# encoding or format further down.  "ufx" is accepted on input as well,
# because only the leading codepoint of each line is read.
pod2usage(1)
    if ( $help
      or $ityp !~ /^(?:u[8bln]|ufx?)$/
      or $otyp !~ /^(?:u[8cbln]|ufx?|[dhpy]e)$/ );

# No file arguments and STDIN is a terminal: there is nothing to read.
pod2usage(2) if ( !$okargs or ( @ARGV == 0 and -t STDIN ) );

# ---------------------------------------------------------------------
# Conversion tables
# ---------------------------------------------------------------------

# pack "S" uses the machine's byte order, pack "v" is always little-endian.
my $native = ( pack( "S", 1 ) eq pack( "v", 1 ) ) ? 'UTF-16LE' : 'UTF-16BE';

my %mode = (
    ub => 'UTF-16BE',
    ul => 'UTF-16LE',
    un => $native,
);

# sprintf templates for the ASCII-only output notations.
my %format = (
    de => '&#%d;',
    he => '&#x%04X;',
    pe => '\x{%04x}',
    ye => '\u%04x',
    uc => 'U+%04X',
);

# Characters that get rewritten: everything outside ASCII, plus (with -c)
# every ASCII control character except tab and line feed.
my $replset = $vctl ? qr/[^\x09\x0a\x20-\x7e]/ : qr/[^\x00-\x7f]/;

# Only the notation-style output types rewrite characters.
my $replsub;
if ( exists $format{$otyp} ) {
    my $template = $format{$otyp};
    $replsub = sub {
        my ($text) = @_;
        $text =~ s/($replset)/sprintf( $template, ord($1) )/ge;
        return $text;
    };
}

my $utf16_in = exists $mode{$ityp};
my $imode    = $utf16_in ? $mode{$ityp} : 'utf8';
my $omode    = exists $mode{$otyp} ? ":encoding($mode{$otyp})" : ':utf8';
binmode STDOUT, $omode;

# Character-name table, needed only for "uf"/"ufx" output.  Single
# codepoints are stored as $names{HEX} = name; ranges are stored as
# $names{range}{name} = [ FIRST_HEX, LAST_HEX ].  Names keep the trailing
# newline they have in the table.
# NOTE(review): this assumes unicore/Name.pl returns tab-separated
# "code <tab> range-end <tab> name" lines -- confirm for the Perl in use.
my %names;
if ( $otyp =~ /^uf/ ) {
    my $table = do 'unicore/Name.pl';
    for my $entry ( split /^/, defined $table ? $table : '' ) {
        my @f = split /\t/, $entry;
        next if @f < 3;    # not a "code, range-end, name" line
        if ( $f[1] eq '' ) {
            $names{ $f[0] } = $f[2];
        }
        else {
            $names{range}{ $f[2] } = [ $f[0], $f[1] ];
        }
    }
}

# ---------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------

my $buffer  = '';    # characters collected from "uf" input lines
my $pending = '';    # raw UTF-16 bytes not yet ending on a whole LF

{
    # UTF-16 input must be cut on the encoded line feed, never on the
    # single byte 0x0A, or code units get split between reads.
    my $utf16_newline = $utf16_in ? encode( $imode, "\n" ) : undef;
    local $/ = $utf16_in ? $utf16_newline : $/;

    while ( my $raw = <> ) {
        if ($utf16_in) {

            # The two-byte separator can also match across two adjacent
            # code units; such a chunk has an odd total length, so keep
            # accumulating until the data ends on a whole encoded LF or
            # the current file is exhausted.
            $pending .= $raw;
            my $complete = length($pending) % 2 == 0
                && substr( $pending, -2 ) eq $utf16_newline;
            next unless $complete or eof(ARGV);
            $raw     = $pending;
            $pending = '';
        }

        my $text = decode( $imode, $raw );

        if ( $ityp =~ /^uf/ ) {

            # Only the leading hex codepoint matters; up to 6 digits so
            # that U+100000..U+10FFFF can be read back.
            next unless $text =~ /^([\da-f]{1,6})\s/i;
            $buffer .= chr( hex($1) );

            # Emit a line's worth of characters once its LF has arrived.
            next unless $buffer =~ /\n\z/;
            $text   = $buffer;
            $buffer = '';
        }
        elsif ( !$utf16_in ) {

            # utf8/ASCII input may also carry ASCII-fied notations:
            $text =~ s/\&#(\d+);/chr($1)/ge;                    # decimal char. entity
            $text =~ s/\&#x([\da-f]+);/chr(hex($1))/ige;        # hex char. entity
            $text =~ s/\\u([\da-f]{4,5})/chr(hex($1))/ige;      # Python hex notation
            $text =~ s/\\x\{([\da-f]+)\}/chr(hex($1))/ige;      # Perl hex notation
            $text =~ s/U\+([\dA-Fa-f]{4,5})/chr(hex($1))/ge;    # unicode.org notation
        }

        emit_text($text);
    }
}

# "uf" input that does not end with a LINE FEED entry still has
# characters waiting in the buffer; write them out instead of dropping them.
emit_text($buffer) if length $buffer;

# ---------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------

# emit_text( $text )
#   Writes one chunk of decoded text to STDOUT in the selected output
#   type: rewritten into the chosen notation when one is active, listed
#   one character per line for "uf"/"ufx", and unchanged otherwise.
sub emit_text {
    my ($text) = @_;

    $text = $replsub->($text) if defined $replsub;

    if ( $otyp !~ /^uf/ ) {
        print $text;
        return;
    }

    for my $c ( split //, $text ) {
        my $h = sprintf( "%04X", ord($c) );

        # Table names and range labels already end in a newline.
        my $name = $names{$h} || get_range($h) || "undefined codepoint\n";
        if ( $otyp eq 'uf' ) {

            # Caret notation for ASCII control characters (^J, ^@, ...).
            $c =~ s/([\x00-\x1f\x7f])/sprintf( "^%s", chr( ord($1) + 64 ) )/e;
            print "$h\t$c\t$name";
        }
        else {
            print "$h\t$name";
        }
    }
    return;
}

# get_range( $hex )
#   Returns the label of the named range in %names that contains the
#   codepoint given as a hex string, or nothing if no range contains it.
#   The bounds are compared numerically: comparing the hex strings
#   themselves gives wrong answers when the digit counts differ.
sub get_range {
    my $h  = shift;
    my $cp = hex $h;
    for my $label ( keys %{ $names{range} } ) {
        my ( $first, $last ) = @{ $names{range}{$label} };
        return $label if $cp >= hex($first) and $cp <= hex($last);
    }
    return;
}

=head1 NAME

tlu -- transliterate unicode

=head1 SYNOPSIS

tlu [-i itype] [-o otype] [-c] [-help|-man] [ file ... ]

=head1 OPTIONS

  itype may be one of:
    u8 : variable-width utf8 characters (default, works for ASCII)
    ul : fixed-width UTF-16LE characters (low byte first)
    ub : fixed-width UTF-16BE characters (high byte first)
    un : fixed-width UTF-16 characters (native byte order)
    uf : hex codepoint, utf8 char and full name (one char per line)

  otype may be one of:
    pe : Perl escape-character notation: \x{hhhh}  (default)
    ye : Python escape-character notation: \uhhhh
    de : html decimal character entities: &#dddd;
    he : hexadecimal character entities: &#xhhhh;
    uc : Unicode codepoint notation: U+hhhh
    u8, ul, ub, un, uf : (same as for itype)
    ufx : same as "uf", but without utf8 char

  -c : apply output notation to ASCII control characters

  -help : show this synopsis
  -man  : show full documentation

=head1 DESCRIPTION

Input may be from STDIN or from one or more files named as args on the
command line.  Output is always to STDOUT.
All input and output is treated as utf8, except when the input and/or
output type is set to "ul", "ub" or "un", in which case the input(s)
and/or STDOUT will be set to "UTF-16LE", "UTF-16BE", or the native byte
order, respectively.

This is a filter whose basic purpose is to put all non-ASCII Unicode
content into a consistent format of the user's choice.  The handling of
input is as flexible and forgiving as possible.  If you indicate
specifically that the input should be treated as UTF-16 (either BE or
LE), then we do need to be strict about that, but in the default case
("u8" for utf8/ASCII input), all of the following are accepted, in any
combination (inputs can be heterogeneous):

=over 4

=item *

actual utf8 wide characters

=item *

html decimal numeric character entity references: &#1234;

=item *

html hexadecimal character entity references: &#x12ab;

=item *

Perl hexadecimal character escapes: \x{12ab}

=item *

Python hexadecimal character escapes: \u12ab

=item *

Unicode hexadecimal codepoint labels: U+12ab

=back

If you select "ul", "ub" or "un" as the output format, all characters
will be converted to 16-bit values (each ASCII character will have a
null high byte appended); selecting "u8" will produce proper utf8 output
(characters in the ASCII range remain single-byte, non-ASCII characters
are 2 to 4 bytes each).

The "uf" format, for both input and output, uses a special treatment in
which each character of data is on a separate line.  By default, each
line is formatted as follows:

  hhhh (tab) utf8_character (tab) FULL UNICODE CHARACTER NAME

The "utf8_character" field uses caret-notation control codes for the
"invisible" ASCII control codes, such as "^J" for line-feed, "^@" for
null, etc.  (ASCII "DELETE" -- 0x7f -- appears as "^" followed by an
upside-down question mark.)

Use "ufx" as the output format to exclude the utf8_character field from
each line (print only the codepoint value and full character name,
separated by tab).
When using "uf" style as input, all that really matters is that the
first token on each line be a valid hexadecimal codepoint value (that
is, you don't need to worry about "uf" vs. "ufx" on input).

All the other output options (de, he, pe, ye, uc) are guaranteed to
generate ASCII-only text data, such that each non-ASCII Unicode
character is represented by its corresponding (decimal or hex) numeric
form with suitable punctuation.  Conversion of wide characters to the
chosen notation is the only modification applied to the data (that is,
nothing else is added to the data, such as extra spacing to make it
easier to read).

The "-c" option is handy if your data includes any of the "invisible"
ASCII control characters; it will cause these to be treated just like
unicode wide characters: they will be made explicitly visible via the
chosen output format (unless you've chosen "u8" for utf8 output).  For
"uf" style output, of course, these things would be visible in any case,
because each character gets its own output line.

=head1 AUTHOR

David Graff

=cut