$ cat utf8-and-html-entities.pl
#!/usr/angebote/perlroot/bin/perl
use strict;
use warnings;
# use strict;
# use IO::File;
# use Text::CSV_XS;
# use DBI;
# use Time::Local;
# use Time::HiRes;
# use Compress::Zlib;
# use LWP::UserAgent;
#use POSIX qw(locale_h);
use HTML::Strip;
use Test::More qw(no_plan);
use Data::Dumper;
#setlocale(LC_CTYPE, "de_DE.ISO8859-1");
require "../../perl/agentFunc.pl";
# Test-case table: each row is [ input string, expected string ] where the
# expected string is what HTML2Text( stripper( input ) ) should produce.
#
# NOTE(review): deleting only the quote character from rows 3 and 4 leaves two
# consecutive spaces, while the expected values below show a single space.
# Confirm whether HTML::Strip collapses the whitespace or whether the expected
# strings lost a space somewhere along the way.
my $stringsBeforeAfter = [
    [ 'blah', 'blah' ],
    [ 'Ü --', 'Ü --'],
    ["blah -- ’ -- blah", "blah -- -- blah"],
    ["Ü -- ’ -- blah", "Ü -- -- blah"],
];

# Strippers under test: [ test name, code reference ]. Every stripper is run
# against every row of the table above, one stripper at a time, so the order
# of the emitted tests is all rows for the first stripper, then all rows for
# the second.
my $strippersUnderTest = [
    [ "stripUtf8Entities",       \&stripUtf8Entities ],
    [ "stripUtf8EntitiesBetter", \&stripUtf8EntitiesBetter ],
];

foreach my $stripperUnderTest ( @$strippersUnderTest ) {
    my ( $testName, $stripper ) = @$stripperUnderTest;

    foreach my $beforeAfter ( @$stringsBeforeAfter ) {
        my ( $before, $after ) = @$beforeAfter;

        # Strip first, then remove HTML markup, then compare.
        my $transformed = HTML2Text( $stripper->( $before ) );
        is($transformed, $after, $testName);
    }
}
# Run a string through a freshly constructed HTML::Strip parser and return
# the parser's result.
#
# Parameters: the text to process (first positional argument).
# Returns:    the scalar returned by HTML::Strip's parse() for that text
#             (presumably the text with HTML markup removed -- see the
#             HTML::Strip documentation).
sub HTML2Text {
    my $markup   = shift;
    my $stripper = HTML::Strip->new();

    # A new parser object is built per call, so no state is carried over
    # between invocations.
    my $plainText = $stripper->parse($markup);
    return $plainText;
}
# Remove known-problematic literal characters from a string.
#
# Works, but only for one special character: the right single quotation mark
# (the character behind the HTML entity &rsquo;). Any other character that
# does not translate well out of UTF-8 is left in place; add further literals
# to the list below to cover them.
#
# Parameters: the text to clean (first positional argument); undef is
#             treated as the empty string.
# Returns:    a copy of the text with every listed literal removed.
sub stripUtf8Entities {
    my ($string) = @_;

    # Only undef falls back to ""; a plain `|| ""` would also turn the
    # legitimate input "0" into the empty string.
    $string = "" unless defined $string;

    my $utf8Entities = ["’"];
    foreach my $utf8Entity ( @$utf8Entities ) {
        # \Q...\E makes the entry match as literal text even if a future
        # entry happens to contain regex metacharacters.
        $string =~ s/\Q$utf8Entity\E//g;
    }
    return $string;
}
# Just a stub -- placeholder for a better, more general way of doing what
# stripUtf8Entities does.
#
# TODO: currently a pass-through: nothing is stripped, so inputs containing
# characters that stripUtf8Entities removes come back unchanged.
#
# Parameters: the text to clean (first positional argument); undef is
#             treated as the empty string.
# Returns:    the text, unmodified.
sub stripUtf8EntitiesBetter {
    my ($string) = @_;

    # Only undef falls back to ""; a plain `|| ""` would also turn the
    # legitimate input "0" into the empty string.
    $string = "" unless defined $string;

    return $string;
}