- write a perl text string to a disk file in encoding specified (none/ansi, utf-8, utf-16le, possibly others)
- read a disk file into a perl string, detecting the encoding and remebering it
- modify the perl string (could be done by an attached callback, as required by the application script)
- write the possibly modified perl string back to disk, using the remembered encoding
- preserve the Windows crlf sequences through the read - modify - write cycle
IOW, I want the simplicity of File::Slurp, with the added intelligence for automatic handling of the unicode encodings.
This would help me when writing scripts for maintenance of large number of text files (c/cpp source files, project files, xml config files, whatever)
which may use a variety of encodings.
I found in perl standard modules and in CPAN modules various unicode-related building blocks, but not the synthesis that I am looking for.
Therefore, I wrote a prototype module which implements, partially, what I want.
While my File::AnyEncoding::write_file for UTF-16LE test file correctly converts "\n" into 0D 00 0A 00,
my File::AnyEncoding::read_file for UTF-16LE does not convert this back to "\n" but to "\r\n", why ?
I did try to play with various settings of binmode, but I failed to obtain the desired result.
My File::AnyEncoding::write_file for UTF-8 test converts "\n" into 0A only, while I expect 0D 0A,
why is this and how to fix it ?
Same question with encoding NONE (by which I mean no BOM, utf-8 encoding).
My module File::AnyEncoding and a test file t\AnyEncoding.t are reproduced below.
They should be placed in subdirs
The module uses File::BOM and the test file uses File::Path and Data::HexDump.
#!/usr/bin/perl
=pod
SYNOPSIS
use File::AnyEncoding; # unicode file writer - reader
my $fun1 = new File::AnyEncoding('utf-16le');
my $text1 = "Hello world";
my $filepath1 = "AnyEncoding-test1.txt";
$fun1->write_file($filepath1, $text1); # writes file with specified
+encoding
my $fun2 = new File::AnyEncoding();
my $text2 = $fun2->read_file($filepath1); # remembers the encoding fo
+und in $filepath1
$text2 =~s/world/unicode/; # modify file contents
my $filepath2 = "AnyEncoding-test2.txt";
$fun2->write_file($filepath2, $text2); # writes file with encoding fo
+und in $filepath1
AUTHOR
Rudif c/o Perlmonks
=cut
package File::AnyEncoding;
use strict;
use File::BOM qw( :all );
our %supported_encoding = map { $_ => 1 } ( 'NONE', 'UTF-8', 'UTF-16LE
+' );
sub new {
my $class = shift;
my $enc = shift // 'utf8';
my $self = {};
bless $self, $class;
$self->set_encoding($enc);
return $self;
}
sub set_encoding {
my $self = shift;
my $enc = shift;
unless ( defined $enc && defined $supported_encoding{ $enc } ) {
$enc = 'NONE';
#warn "defaulting to $enc";
}
$self->{encoding} = $enc;
}
sub get_encoding {
my $self = shift;
$self->{encoding};
}
sub write_file {
my $self = shift;
my $filepath = shift;
my $text = join '', @_;
my $enc = $self->{encoding};
my $FH;
if ( $enc eq 'NONE' ) {
open $FH, ">", $filepath;
#open $FH, ">:raw:encoding(UTF-8):crlf:utf8", $filepath;
}
else {
open $FH, ">:raw:encoding($enc):crlf:utf8", $filepath;
print $FH "\x{FEFF}";
}
print $FH $text;
close $FH;
}
sub read_file {
my $self = shift;
my $filepath = shift;
open my $FH, '<:bytes', "$filepath";
my ( $enc, $spillage ) = get_encoding_from_filehandle($FH);
$enc = $self->set_encoding($enc);
if ( $enc eq 'NONE' ) {
#binmode $FH, ":encoding(UTF-8)";
close $FH;
open $FH, '<', "$filepath";
}
else {
binmode $FH, ":encoding($enc)";
}
my @lines = <$FH>;
close $FH;
wantarray ? @lines : join '', @lines;
}
1;
#!/usr/bin/perl
use strict;
$|++;
use lib('..');
use Data::HexDump;
use File::Path;
my $data = '.\data';
rmtree $data; # remove old data if any
mkdir $data;
use Test::More tests => 28;
use File::AnyEncoding; # under test
use_ok('File::AnyEncoding');
# test utf-16le encoding
{
# create object and write test file with specified encoding
my $encoding = 'UTF-16LE';
my $fan1 = File::AnyEncoding->new($encoding);
isa_ok( $fan1, 'File::AnyEncoding', '$fan1' );
is( $fan1->get_encoding(), 'UTF-16LE', "specified encoding" );
my $text1 = "Hello world \x{263A}\n\n";
my $file1 = "$data\\$encoding.txt";
$fan1->write_file( $file1, $text1 );
my $expected1 = <<HERE;
00000000 FF FE 48 00 65 00 6C 00 - 6C 00 6F 00 20 00 77 00 ..H.e.l.l
+.o. .w.
00000010 6F 00 72 00 6C 00 64 00 - 20 00 3A 26 0D 00 0A 00 o.r.l.d.
+.:&....
00000020 0D 00 0A 00 ....
HERE
is( my_hexdump($file1), $expected1, "write_file $file1" );
# reset encoding - should be detected in read_file
$fan1->set_encoding('NONE');
is( $fan1->get_encoding(), 'NONE', "default encoding" );
# read test file and remember encoding
my $text2 = $fan1->read_file($file1);
is( $fan1->get_encoding(), 'UTF-16LE', "detected encoding" );
my $expected2 = 'Hello world \x{263a}\x{d}\x{a}\x{d}\x{a}';
is( my_reasciify($text2), $expected2, "read_file $file1" );
my @lines2 = $fan1->read_file($file1);
is( scalar @lines2, 2, "read_file $file1" );
my $join2 = join( '', @lines2 );
is( my_reasciify($join2), $expected2, "read_file $file1" );
# modify text and write second file using the remebered encoding
$text2 =~ s/world/WORLD/;
$text2 =~ s/\x{263A}/\x{20AC}/;
( my $file2 = $file1 ) =~ s/.txt/-2.txt/;
$fan1->write_file( $file2, $text2 ); # writes file with remembe
+red encoding
my $expected3 = <<HERE;
00000000 FF FE 48 00 65 00 6C 00 - 6C 00 6F 00 20 00 57 00 ..H.e.l.l
+.o. .W.
00000010 4F 00 52 00 4C 00 44 00 - 20 00 AC 20 0D 00 0D 00 O.R.L.D.
+.. ....
00000020 0A 00 0D 00 0D 00 0A 00 ........
HERE
is( my_hexdump($file2), $expected3, "write_file $file1" );
}
# test utf-8 encoding
{
# create object and write test file with specified encoding
my $encoding = 'UTF-8';
my $fan1 = File::AnyEncoding->new($encoding);
isa_ok( $fan1, 'File::AnyEncoding', '$fan1' );
is( $fan1->get_encoding(), 'UTF-8', "specified encoding" );
my $text1 = "Hello world \x{263A}\n\n";
my $file1 = "$data\\$encoding.txt";
$fan1->write_file( $file1, $text1 );
my $expected1 = <<HERE;
00000000 EF BB BF 48 65 6C 6C 6F - 20 77 6F 72 6C 64 20 E2 ...Hello
+world .
00000010 98 BA 0A 0A ....
HERE
is( my_hexdump($file1), $expected1, "write_file $file1" );
# reset encoding - should be detected in read_file
$fan1->set_encoding('NONE');
is( $fan1->get_encoding(), 'NONE', "default encoding" );
# read test file and remember encoding
my $text2 = $fan1->read_file($file1);
is( $fan1->get_encoding(), 'UTF-8', "detected encoding" );
my $expected2 = 'Hello world \x{263a}\x{a}\x{a}';
is( my_reasciify($text2), $expected2, "read_file $file1" );
my @lines2 = $fan1->read_file($file1);
is( scalar @lines2, 2, "read_file $file1" );
my $join2 = join( '', @lines2 );
is( my_reasciify($join2), $expected2, "read_file $file1" );
# modify text and write second file using the remebered encoding
$text2 =~ s/world/WORLD/;
$text2 =~ s/\x{263A}/\x{20AC}/;
( my $file2 = $file1 ) =~ s/.txt/-2.txt/;
$fan1->write_file( $file2, $text2 ); # writes file with remembe
+red encoding
my $expected3 = <<HERE;
00000000 EF BB BF 48 65 6C 6C 6F - 20 57 4F 52 4C 44 20 E2 ...Hello
+WORLD .
00000010 82 AC 0A 0A ....
HERE
is( my_hexdump($file2), $expected3, "write_file $file1" );
}
# test no encoding
{
# create object and write test file with specified encoding
my $encoding = 'NONE';
my $fan1 = File::AnyEncoding->new($encoding);
isa_ok( $fan1, 'File::AnyEncoding', '$fan1' );
is( $fan1->get_encoding(), 'NONE', "specified encoding" );
my $text1 = "Hello world \x{263A}\n\n";
my $file1 = "$data\\$encoding.txt";
$fan1->write_file( $file1, $text1 );
my $expected1 = <<HERE;
00000000 48 65 6C 6C 6F 20 77 6F - 72 6C 64 20 E2 98 BA 0A Hello wor
+ld ....
00000010 0A .
HERE
is( my_hexdump($file1), $expected1, "write_file $file1" );
# reset encoding - should be detected in read_file
$fan1->set_encoding('UTF-8');
is( $fan1->get_encoding(), 'UTF-8', "default encoding" );
# read test file and remember encoding
my $text2 = $fan1->read_file($file1);
is( $fan1->get_encoding(), 'NONE', "detected encoding" );
my $expected2 = 'Hello world \x{e2}\x{98}\x{ba}\x{a}\x{a}';
is( my_reasciify($text2), $expected2, "read_file $file1" );
my @lines2 = $fan1->read_file($file1);
is( scalar @lines2, 2, "read_file $file1" );
my $join2 = join( '', @lines2 );
is( my_reasciify($join2), $expected2, "read_file $file1" );
# modify text and write second file using the remebered encoding
$text2 =~ s/world/WORLD/;
$text2 =~ s/\x{263A}/\x{20AC}/;
( my $file2 = $file1 ) =~ s/.txt/-2.txt/;
$fan1->write_file( $file2, $text2 ); # writes file with remembe
+red encoding
my $expected3 = <<HERE;
00000000 48 65 6C 6C 6F 20 57 4F - 52 4C 44 20 E2 98 BA 0A Hello WOR
+LD ....
00000010 0A .
HERE
is( my_hexdump($file2), $expected3, "write_file $file1" );
}
exit 0;
# returns hexdump of the $file
sub my_hexdump {
my $file = shift;
my $f = new Data::HexDump;
unless ( -f $file ) {
warn "no such file $file";
return '---';
}
$f->file($file);
my $str = '';
while ( local $_ = $f->dump ) {
$str .= $_;
}
$str =~ s/.*00000000/00000000/s;
return $str;
}
# returns sprintf of characters in $string,
# replacing those not printable as ascii by their hex code point numer
+ic value
# similar to sub in File::BOM
sub my_reasciify {
my $string = shift;
$string = join "", map {
my $ord = ord($_);
# ($ord > 127 || ($ord < 32 && $ord != 10))
( $ord > 127 || $ord < 32 )
? sprintf '\x{%x}', $ord
: $_
} split //, $string;
}
__END__