I have the following script which stores email to a database. I'm having a problem with message subjects/bodies which contain utf-8 encoded data. The message is split up into parts by Mime::Parser. The parts are stored in /tmp/msg-*.
I am able to manually insert a utf-8 encoded string into the subject column. So I've got the column encoding set correctly.
My program is able to store the utf-8 message body correctly.
Also, the Mime::Parser module doesn't seem to be able to handle gb2312, big5, or gbk character encodings.
MAIN PROGRAM
#!/usr/bin/perl
use strict;
use DBI;
use File::Type;
use Date::Parse;
use MIME::Parser;
use Mail::POP3Client;
use Getopt::Std;
use Data::Dumper;
use POSIX qw(strftime);
use Compress::Zlib qw(compress);
binmode(STDOUT, ":utf8");
$|++;
my %opts;
getopts('d', \%opts);
my $dbh = DBI->connect(
'dbi:mysql:mail_archive',
'username',
'password', {
AutoCommit => 1
}
) or die $DBI::errstr;
$dbh->{mysql_enable_utf8} = 1;
$dbh->do("set character set utf8");
$dbh->do("set names utf8");
my %query;
$query{max_pack} = $dbh->prepare(<<EOQ);
set max_allowed_packet=16776192
EOQ
$query{last_id} = $dbh->prepare(<<EOQ);
select last_insert_id()
EOQ
$query{body_md5_match} = $dbh->prepare(<<EOQ);
select body_id from MA_body where check_sum = md5(?) and body = ?
EOQ
$query{attach_md5_match} = $dbh->prepare(<<EOQ);
select attach_id from MA_attach where check_sum = md5(?) and attach =
+?
EOQ
$query{ins_header} = $dbh->prepare(<<EOQ);
insert into MA_hdr (
subject,
msgid,
body_id,
full_header
) values (?,?,?,?)
EOQ
$query{ins_addr} = $dbh->prepare(<<EOQ);
insert into MA_addr (
email_header_id,
hdr_id,
header_type,
email_id
) values (?,?,?,?)
EOQ
$query{lkup_email} = $dbh->prepare(<<EOQ);
select email_id from MA_email where email_addr = ?
EOQ
$query{ins_email} = $dbh->prepare(<<EOQ);
insert ignore into MA_email (email_addr) values (?)
EOQ
$query{ins_body} = $dbh->prepare(<<EOQ);
insert into MA_body (
body,
check_sum
) values (?, md5(?) )
EOQ
$query{ins_attach} = $dbh->prepare(<<EOQ);
insert into MA_attach (
attach_name,
mime_type,
check_sum,
attach
) values (?, ?, md5(?), ?)
EOQ
$query{ins_attach_addr} = $dbh->prepare(<<EOQ);
insert into MA_attach_addr (
hdr_id,
attach_id
) values (?,?)
EOQ
$query{max_pack}->execute();
my $ft = File::Type->new();
POP3_CONNECTION:
my $pop = new Mail::POP3Client(HOST => "mail.tradetech.net");
$pop->User("mail_archive_mirror");
$pop->Pass("PTWH7EJU");
### Loop forever.
while (1) {
if (! $pop->Connect()) {
sleep 2;
goto POP3_CONNECTION;
}
### Now we iterate over each message present on the server.
for (my $num = 1; $num <= $pop->Count(); $num++) {
my $message = $pop->Retrieve($num);
### Cut message into parts.
my $parts = parse_message($message);
### Store parts to sql tables.
create_record($parts);
### Delete the message.
$pop->Delete($num);
}
sleep 1;
}
#################################################################
sub debug {
#################################################################
print @_, "\n" if $opts{d};
}
#################################################################
sub create_record {
#################################################################
my $parts = shift();
my $header_id;
my $body_id;
my $header_cntr = 1;
### Check to see if we have an existing body record.
$query{body_md5_match}->execute($parts->{body}, $parts->{body});
if ($query{body_md5_match}->rows() == 1) {
($body_id) = $query{body_md5_match}->fetchrow_array();
}
if (!$body_id) {
### We need to add a body record, becuase one doesn't exist.
$query{ins_body}->execute(
$parts->{body},
$parts->{body},
);
$query{last_id}->execute();
($body_id) = $query{last_id}->fetchrow_array();
}
### Insert the header record for the message.
$query{ins_header}->execute(
$parts->{subject},
$parts->{'Message-ID'},
$body_id,
$parts->{full_header},
);
$query{last_id}->execute();
($header_id) = $query{last_id}->fetchrow_array();
### Insert addr records for the header parts.
for my $addr qw(from to cc) {
for my $email (@{$parts->{"distinct_$addr"}}) {
$query{ins_email}->execute($email);
$query{lkup_email}->execute($email);
my ($email_id) = $query{lkup_email}->fetchrow_array();
$query{ins_addr}->execute(
$header_cntr++,
$header_id,
$addr,
$email_id,
);
}
}
### Add attachments records.
if (-d $parts->{output_dir}) {
opendir DIR, $parts->{output_dir};
my @attachments = grep { ! /^\./ && ! /^msg/ && ! /\.txt$/ } r
+eaddir DIR;
closedir DIR;
chdir $parts->{output_dir};
ATTACH_LOOP:
for my $attachment (@attachments) {
open FILE, $attachment;
my $contents = do { local $/; <FILE> };
close FILE;
my $mt = $ft->mime_type($contents);
my $contents_gz = compress($contents, 9);
### We skip large attachments.
$query{attach_md5_match}->execute($contents_gz, $contents_
+gz)
or next ATTACH_LOOP;
my ($attach_id) = $query{attach_md5_match}->fetchrow_array
+();
debug("# attach_id attach_md5_match: $attach_id");
if (!$attach_id) {
$query{ins_attach}->execute(
$attachment,
$mt,
$contents_gz,
$contents_gz,
);
$query{last_id}->execute();
($attach_id) = $query{last_id}->fetchrow_array();
debug("# new attachment inserted into MA_attach: $atta
+ch_id");
}
### Insert attach_addr record.
$query{ins_attach_addr}->execute(
$header_id,
$attach_id,
);
}
}
}
#################################################################
sub parse_message {
#################################################################
my $message = shift;
my $parser = MIME::Parser->new() or return 0;
$parser->ignore_errors(1) or return 0;
$parser->extract_uuencode(1) or return 0;
$parser->output_under('/tmp');
my $entity = $parser->parse_data($message) or die $!;
my $header = $entity->head() or die $!;
my $parts = { map {
my $val = $header->get($_);
chomp $val if $val;
$_ => $val;
} qw(subject date to from cc Message-ID) };
$parts->{full_header} = $header->as_string();
### Remove Trash from mail address fields.
for my $var (qw(from to cc)) {
$parts->{$var} = lc $parts->{$var};
# my @matches = $parts->{$var} =~ /[a-z0-9\._]*@[a-z0-9\._]*\.
+[a-z0-9]*/g;
my @matches = $parts->{$var} =~ /[a-z0-9\._-]*@[a-z0-9\._-]*\.
+[a-z0-9\._-]*/g;
my %seen;
### Get a uniqe list.
for my $elmt (@matches) {
$seen{$elmt}++;
}
@matches = keys %seen;
### Separate values with comma
if (@matches) {
$parts->{$var} = join ',', @matches;
# $parts->{$var} =~ s/.$//;
if ($var eq "from" || $var eq "to" || $var eq "cc") {
$parts->{"distinct_$var"} = \@matches;
}
}
}
### Gives the name of the output directory tree.
$parts->{output_dir} = $parser->output_dir();
### Put body together.
if (-d $parts->{output_dir}) {
$parts->{body} = "";
opendir DIR, $parts->{output_dir};
my @body_parts = grep { /^msg/ } readdir DIR;
sort @body_parts;
closedir DIR;
chdir $parts->{output_dir};
for my $part (@body_parts) {
open FILE, $part;
my $contents = do { local $/; <FILE> };
close FILE;
$parts->{body} .= $contents;
}
}
### $parts->{body} = join "", @{$entity->body()};
debug("# Email parts");
debug(Dumper($parts));
return $parts;
}
__END__