I have two versions of the same perl search script, one ist standalone and the other one is a three-lines perl script, that calls the main function in a perl module. Both versions have the same code. The module version works as supposed and the standalone version crashes with the error "Malformed UTF-8 character" at the line with the regex /romantic/
This must be an internal bug of perl. It is well known that perl had (and probably has) unicode/utf8 issues. I am using Strawberry perl 5.30.0 (built for MSWin32-x64-multi-thread) on windows 10 pro client with recent updates.
Here are the codes and the files that are needed to reproduce this problem.
- The searched files are in http://ftp.freedb.org/pub/freedb/freedb-update-20200201-20200301.tar.bz2
I am extracting this file into C:/MyScripts/freedb-update-20200201-20200301
- The module version of the script C:/MyScripts/search_script_with module.pl is:
BEGIN{push(@INC,'C:/MyScripts');}
use searchFreedb;
mainSearchFreedb('C:/MyScripts/freedb-update-20200201-20200301');
print "End of script\n";
- The corresponding module C:/MyScripts/searchFreedb.pm is
package searchFreedb;
use strict;
use utf8;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(mainSearchFreedb);
@EXPORT_OK = qw(mainSearchFreedb);
$VERSION = 1.0;
$| = 1;
#############################################################
# mainSearchFreedb
#############################################################
sub mainSearchFreedb {
my ($searchdir) = @_;
open(FILE, ">C:/MyScripts/sresult_module.txt") || die "$!\n";
binmode FILE, ":utf8";
recursivSearchFreedb($searchdir);
close(FILE);
}
#############################################################
# recursivSearchFreedb
#############################################################
sub recursivSearchFreedb {
my ($dir) = @_;
die "dir $dir!\n" if(!$dir || !(-e $dir && -d $dir));
$dir =~ s/[\/\\]+/\//og;
$dir = $dir . '/' if( $dir !~ /\/$/o );
my ($dirname) = ( $dir =~ /^.*\/([^\/]+?)\/*$/o );
opendir(DIR,$dir) || warn __LINE__."$!\n";
my @all_dir_files = readdir(DIR);
closedir(DIR);
print "Folder: $dir => $dirname\n";
foreach my $dir_file ( sort @all_dir_files ) {
$dir_file =~ /^\.+$/o && next;
my $abspath = $dir . $dir_file;
if( -d $abspath ) {
recursivSearchFreedb($abspath);
}
else {
if($dir_file =~ /(^COPYING$|^README$)$)/io) {
print "skipping $dir_file\n";
next;
}
elsif(-z $abspath) {
next;
}
my ($content);
open(IN, "<$abspath") || die "$!\n";
while(my $line = <IN>) {
next if not $line =~ /^#\s+xmcd/o;
$content .= $line;
my ($TITLEALL,$DISCID,$GENRE);
for(;;) {
my $line2 = <IN>;
if($line2=~/^\s*DTITLE\s*=(.*)$/o) {$TITLEALL .= $1;}
if($line2=~/^\s*DISCID=\s*(.+?)\s*$/o) {$DISCID = $1;}
if($line2=~/^\s*DGENRE\s*=(.*)$/o) {$GENRE .= $1;}
$content .= $line2;
if($line2 =~ /^PLAYORDER=/o) {
if( $TITLEALL =~ /Romanti[cqk]/io ) {
print FILE "$content\n";
}
last;
}
}
}
close(IN);
}
}
}
##############################################################
# end of package
##############################################################
1;
- The standalone version of the script C:/MyScripts/search_script_standalone.pl is:
use strict;
use utf8;
$| = 1;
#############################################################
# recursivSearchFreedb
#############################################################
sub recursivSearchFreedb {
my ($dir) = @_;
die "dir $dir\n" if(!$dir || !(-e $dir && -d $dir));
$dir =~ s/[\/\\]+/\//og;
$dir = $dir . '/' if( $dir !~ /\/$/o );
my ($dirname) = ( $dir =~ /^.*\/([^\/]+?)\/*$/o );
opendir(DIR,$dir) || warn __LINE__."$!\n";
my @all_dir_files = readdir(DIR);
closedir(DIR);
print "Folder: $dir => $dirname\n";
foreach my $dir_file ( sort @all_dir_files ) {
$dir_file =~ /^\.+$/o && next;
my $abspath = $dir . $dir_file;
if( -d $abspath ) {
recursivSearchFreedb($abspath);
}
else {
if($dir_file =~ /(^COPYING$|^README$)/io) {
print "skipping $dir_file\n";
next;
}
elsif(-z $abspath) {
next;
}
my ($content);
open(IN, "<$abspath") || die "$!\n";
while(my $line = <IN>) {
next if not $line =~ /^#\s+xmcd/o;
$content .= $line;
my ($TITLEALL,$DISCID,$GENRE);
for(;;) {
my $line2 = <IN>;
if($line2=~/^\s*DTITLE\s*=(.*)$/o) {$TITLEALL .= $1;}
if($line2=~/^\s*DISCID=\s*(.+?)\s*$/o) {$DISCID = $1;}
if($line2=~/^\s*DGENRE\s*=(.*)$/o) {$GENRE .= $1;}
$content .= $line2;
if($line2 =~ /^PLAYORDER=/o) {
if( $TITLEALL =~ /Romanti[cqk]/io ) {
print FILE "$content\n";
}
last;
}
}
}
close(IN);
}
}
}
############################################################
# main starts here
############################################################
open(FILE, ">C:/MyScripts/sresult_standalone.txt") || die "$!\n";
binmode FILE, ":utf8";
recursivSearchFreedb('C:/MyScripts/freedb-update-20200201-20200301');
close(FILE);
print "End of script\n";
I am starting the module version with
"perl -CDS search_script_with module.pl"
The result is:
Folder: C:/MyScripts/freedb-20200201-20200301/
Folder: C:/MyScripts/freedb-20200201-20200301/blues/
Folder: C:/MyScripts/freedb-20200201-20200301/classical/
Folder: C:/MyScripts/freedb-20200201-20200301/country/
Folder: C:/MyScripts/freedb-20200201-20200301/data/
Folder: C:/MyScripts/freedb-20200201-20200301/folk/
Folder: C:/MyScripts/freedb-20200201-20200301/jazz/
Folder: C:/MyScripts/freedb-20200201-20200301/misc/
Folder: C:/MyScripts/freedb-20200201-20200301/newage/
Folder: C:/MyScripts/freedb-20200201-20200301/reggae/
Folder: C:/MyScripts/freedb-20200201-20200301/rock/
Folder: C:/MyScripts/freedb-20200201-20200301/soundtrack/
End of script
I am starting the standalone version with
"perl -CDS search_script_standalone.pl"
The result is (it crashes very quickly):
Folder: C:/MyScripts/freedb-20200201-20200301/
Folder: C:/MyScripts/freedb-20200201-20200301/blues/
Malformed UTF-8 character: \xf6\x6e\x20\x26 (unexpected non-continuati
+on byte 0x6e, immediately after start byte 0xf6; need 4 bytes, got 1)
+ in pattern match (m//) at C:\MYSCRI~1\SEARCH~2.PL line 55, <IN> line
+ 67.
Malformed UTF-8 character (fatal) at C:\MYSCRI~1\SEARCH~2.PL line 55,
+<IN> line 67.
Any ideas why the standalone version crashes? Can you reproduce the problem on your own pc? Thank you for your answers or ideas.