#!/bin/perl -w

# Converts XML data from UTF-8 back into latin1.
#
# Usage: pass exactly one of the following switches:
#   -r  uses a regexp
#   -u  uses Unicode::String (together with Unicode::Map8)
#   -i  uses Text::Iconv (and the iconv library)
#
# Note: -r does not work properly with XML::Parser 2.30
#
# The document to convert is read from the __DATA__ section at the end of
# this file and the converted markup is printed on STDOUT.

use strict;
use warnings;
use XML::Parser;

print "perl $] - XML::Parser $XML::Parser::VERSION\n";

# Pick the conversion routine from the first command-line argument.
# A missing argument is treated like an unknown one (usage message) instead
# of triggering an "uninitialized value" warning first.
my $mode = defined $ARGV[0] ? $ARGV[0] : '';

my $filter;
if    ( $mode eq '-r' ) { $filter = \&latin1; }
elsif ( $mode eq '-u' ) { $filter = unicode_convert('latin1'); }
elsif ( $mode eq '-i' ) { $filter = iconv_convert('latin1'); }
else                    { die "usage: $0 [-r|-u|-i]\n"; }

# I like to escape as few characters as possible,
# but you might need to escape ' too (with &apos;).
my %ent = (
    '"' => '&quot;',
    '<' => '&lt;',
    '&' => '&amp;',
);

# 'filter' is not an XML::Parser option of its own: it is stored on the
# parser so that the handlers below can fetch the conversion routine back
# through $p->{filter}.
my $p = XML::Parser->new(
    Handlers => {
        Start   => \&start,
        End     => \&end,
        Default => \&default,
    },
    filter => $filter,
);

$p->parse( \*DATA );
print "\n";

# Start handler: prints the converted start tag.
#   $p    - the parser object handed to the handler (carries {filter})
#   $tag  - the element name
#   @atts - attribute name/value pairs, in document order
sub start {
    my ( $p, $tag, @atts ) = @_;

    print '<', $p->{filter}->($tag);

    # Walk the pairs in order rather than through a hash, so that the
    # attributes are written in the order they were received.
    while ( my ( $att, $val ) = splice @atts, 0, 2 ) {

        # The value is written inside double quotes, so the characters
        # listed in %ent must be turned back into entities.
        $val =~ s/(["<&])/$ent{$1}/g;
        print ' ', $p->{filter}->($att), '="', $p->{filter}->($val), '"';
    }

    print '>';
}

# End handler: prints the converted end tag.
#   $p   - the parser object handed to the handler (carries {filter})
#   $tag - the element name
sub end {
    my ( $p, $tag ) = @_;
    print '</', $p->{filter}->($tag), '>';
}

# Default handler: prints, converted, the string that the parser recognized
# for everything not covered by the Start and End handlers.
#   $p - the parser object handed to the handler (carries {filter})
sub default {
    my ($p) = @_;
    print $p->{filter}->( $p->recognized_string() );
}

# shamelessly lifted from XML::TyePYX
#
# Regexp-based conversion: folds every 2-byte UTF-8 sequence whose lead byte
# is in \xc0-\xc3 into the single byte it encodes. Any other sequence is left
# untouched.
#   $text   - the string to convert
#   returns - the converted string
sub latin1 {
    my $text = shift;

    # The substitution below is byte oriented. If perl holds $text as a
    # character string, turn it into its UTF-8 bytes first; otherwise the
    # pattern would be matched against characters and could mangle a genuine
    # A-grave .. A-tilde followed by any other character.
    utf8::encode($text)
      if defined &utf8::is_utf8 && utf8::is_utf8($text);

    # The second byte must be a UTF-8 continuation byte (\x80-\xbf).
    $text =~ s{([\xc0-\xc3])([\x80-\xbf])}{
        my $hi = ord($1);
        my $lo = ord($2);
        chr( ( ( $hi & 0x03 ) << 6 ) | ( $lo & 0x3F ) )
    }ge;

    return $text;
}

# Builds a conversion routine based on Unicode::String and Unicode::Map8.
#   $enc    - name of the target 8-bit encoding
#   returns - a code reference taking a UTF-8 string and returning it in $enc
# The modules are loaded only when this conversion is requested; the
# converter object is created on first use and dies if that fails.
sub unicode_convert {
    my $enc = shift;

    require Unicode::Map8;
    require Unicode::String;

    my $cnv;    # created lazily, then reused by the closure
    return sub {
        $cnv ||= Unicode::Map8->new($enc)
          or die "Can't create converter";
        return $cnv->to8( Unicode::String::utf8( $_[0] )->ucs2 );
    };
}

# Builds a conversion routine based on Text::Iconv.
#   $enc    - name of the target encoding
#   returns - a code reference taking a UTF-8 string and returning it in $enc
# The module is loaded only when this conversion is requested; the converter
# object is created on first use and dies if that fails.
sub iconv_convert {
    my $enc = shift;

    require Text::Iconv;

    my $cnv;    # created lazily, then reused by the closure
    return sub {
        $cnv ||= Text::Iconv->new( 'utf8', $enc )
          or die "Can't create converter";
        return $cnv->convert( $_[0] );
    };
}

__DATA__
<doc>Un homme soupçonné d'être impliqué dans la mort d'un motard de la police, renversé</doc>