Beefy Boxes and Bandwidth Generously Provided by pair Networks
"be consistent"
 
PerlMonks  

Mime::Parser utf-8 issue

by mhearse (Chaplain)
on Jun 11, 2009 at 22:55 UTC ( [id://770782]=perlquestion: print w/replies, xml ) Need Help??

mhearse has asked for the wisdom of the Perl Monks concerning the following question:

I have the following script which stores email to a database. I'm having a problem with message subjects/bodies which contain utf-8 encoded data. The message is split up into parts by Mime::Parser. The parts are stored in /tmp/msg-*.

The utf-8 subjects show up like this in the mysql table: UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06Laj55qE5Li7?==?UTF-8?B?6aKY?=

I am able to manually insert a utf-8 encoded string into the subject column. So I've got the column encoding set correctly.

My program is able to store the utf-8 message body correctly.

Also, the Mime::Parser module doesn't seem to be able to handle gb2312, big5, or gbk character encodings.

Can someone offer guidance?

MAIN PROGRAM #!/usr/bin/perl use strict; use DBI; use File::Type; use Date::Parse; use MIME::Parser; use Mail::POP3Client; use Getopt::Std; use Data::Dumper; use POSIX qw(strftime); use Compress::Zlib qw(compress); binmode(STDOUT, ":utf8"); $|++; my %opts; getopts('d', \%opts); my $dbh = DBI->connect( 'dbi:mysql:mail_archive', 'username', 'password', { AutoCommit => 1 } ) or die $DBI::errstr; $dbh->{mysql_enable_utf8} = 1; $dbh->do("set character set utf8"); $dbh->do("set names utf8"); my %query; $query{max_pack} = $dbh->prepare(<<EOQ); set max_allowed_packet=16776192 EOQ $query{last_id} = $dbh->prepare(<<EOQ); select last_insert_id() EOQ $query{body_md5_match} = $dbh->prepare(<<EOQ); select body_id from MA_body where check_sum = md5(?) and body = ? EOQ $query{attach_md5_match} = $dbh->prepare(<<EOQ); select attach_id from MA_attach where check_sum = md5(?) and attach = +? EOQ $query{ins_header} = $dbh->prepare(<<EOQ); insert into MA_hdr ( subject, msgid, body_id, full_header ) values (?,?,?,?) EOQ $query{ins_addr} = $dbh->prepare(<<EOQ); insert into MA_addr ( email_header_id, hdr_id, header_type, email_id ) values (?,?,?,?) EOQ $query{lkup_email} = $dbh->prepare(<<EOQ); select email_id from MA_email where email_addr = ? EOQ $query{ins_email} = $dbh->prepare(<<EOQ); insert ignore into MA_email (email_addr) values (?) EOQ $query{ins_body} = $dbh->prepare(<<EOQ); insert into MA_body ( body, check_sum ) values (?, md5(?) ) EOQ $query{ins_attach} = $dbh->prepare(<<EOQ); insert into MA_attach ( attach_name, mime_type, check_sum, attach ) values (?, ?, md5(?), ?) EOQ $query{ins_attach_addr} = $dbh->prepare(<<EOQ); insert into MA_attach_addr ( hdr_id, attach_id ) values (?,?) EOQ $query{max_pack}->execute(); my $ft = File::Type->new(); POP3_CONNECTION: my $pop = new Mail::POP3Client(HOST => "mail.tradetech.net"); $pop->User("mail_archive_mirror"); $pop->Pass("PTWH7EJU"); ### Loop forever. while (1) { if (! $pop->Connect()) { sleep 2; goto POP3_CONNECTION; } ### Now we iterate over each message present on the server. for (my $num = 1; $num <= $pop->Count(); $num++) { my $message = $pop->Retrieve($num); ### Cut message into parts. my $parts = parse_message($message); ### Store parts to sql tables. create_record($parts); ### Delete the message. $pop->Delete($num); } sleep 1; } ################################################################# sub debug { ################################################################# print @_, "\n" if $opts{d}; } ################################################################# sub create_record { ################################################################# my $parts = shift(); my $header_id; my $body_id; my $header_cntr = 1; ### Check to see if we have an existing body record. $query{body_md5_match}->execute($parts->{body}, $parts->{body}); if ($query{body_md5_match}->rows() == 1) { ($body_id) = $query{body_md5_match}->fetchrow_array(); } if (!$body_id) { ### We need to add a body record, becuase one doesn't exist. $query{ins_body}->execute( $parts->{body}, $parts->{body}, ); $query{last_id}->execute(); ($body_id) = $query{last_id}->fetchrow_array(); } ### Insert the header record for the message. $query{ins_header}->execute( $parts->{subject}, $parts->{'Message-ID'}, $body_id, $parts->{full_header}, ); $query{last_id}->execute(); ($header_id) = $query{last_id}->fetchrow_array(); ### Insert addr records for the header parts. for my $addr qw(from to cc) { for my $email (@{$parts->{"distinct_$addr"}}) { $query{ins_email}->execute($email); $query{lkup_email}->execute($email); my ($email_id) = $query{lkup_email}->fetchrow_array(); $query{ins_addr}->execute( $header_cntr++, $header_id, $addr, $email_id, ); } } ### Add attachments records. if (-d $parts->{output_dir}) { opendir DIR, $parts->{output_dir}; my @attachments = grep { ! /^\./ && ! /^msg/ && ! /\.txt$/ } r +eaddir DIR; closedir DIR; chdir $parts->{output_dir}; ATTACH_LOOP: for my $attachment (@attachments) { open FILE, $attachment; my $contents = do { local $/; <FILE> }; close FILE; my $mt = $ft->mime_type($contents); my $contents_gz = compress($contents, 9); ### We skip large attachments. $query{attach_md5_match}->execute($contents_gz, $contents_ +gz) or next ATTACH_LOOP; my ($attach_id) = $query{attach_md5_match}->fetchrow_array +(); debug("# attach_id attach_md5_match: $attach_id"); if (!$attach_id) { $query{ins_attach}->execute( $attachment, $mt, $contents_gz, $contents_gz, ); $query{last_id}->execute(); ($attach_id) = $query{last_id}->fetchrow_array(); debug("# new attachment inserted into MA_attach: $atta +ch_id"); } ### Insert attach_addr record. $query{ins_attach_addr}->execute( $header_id, $attach_id, ); } } } ################################################################# sub parse_message { ################################################################# my $message = shift; my $parser = MIME::Parser->new() or return 0; $parser->ignore_errors(1) or return 0; $parser->extract_uuencode(1) or return 0; $parser->output_under('/tmp'); my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $parts = { map { my $val = $header->get($_); chomp $val if $val; $_ => $val; } qw(subject date to from cc Message-ID) }; $parts->{full_header} = $header->as_string(); ### Remove Trash from mail address fields. for my $var (qw(from to cc)) { $parts->{$var} = lc $parts->{$var}; # my @matches = $parts->{$var} =~ /[a-z0-9\._]*@[a-z0-9\._]*\. +[a-z0-9]*/g; my @matches = $parts->{$var} =~ /[a-z0-9\._-]*@[a-z0-9\._-]*\. +[a-z0-9\._-]*/g; my %seen; ### Get a uniqe list. for my $elmt (@matches) { $seen{$elmt}++; } @matches = keys %seen; ### Separate values with comma if (@matches) { $parts->{$var} = join ',', @matches; # $parts->{$var} =~ s/.$//; if ($var eq "from" || $var eq "to" || $var eq "cc") { $parts->{"distinct_$var"} = \@matches; } } } ### Gives the name of the output directory tree. $parts->{output_dir} = $parser->output_dir(); ### Put body together. if (-d $parts->{output_dir}) { $parts->{body} = ""; opendir DIR, $parts->{output_dir}; my @body_parts = grep { /^msg/ } readdir DIR; sort @body_parts; closedir DIR; chdir $parts->{output_dir}; for my $part (@body_parts) { open FILE, $part; my $contents = do { local $/; <FILE> }; close FILE; $parts->{body} .= $contents; } } ### $parts->{body} = join "", @{$entity->body()}; debug("# Email parts"); debug(Dumper($parts)); return $parts; } __END__

Replies are listed 'Best First'.
Re: Mime::Parser utf-8 issue
by blahblahblah (Priest) on Jun 12, 2009 at 03:30 UTC
    I don't think that MIME::Parser touches the encoded headers. You should be able to decode them with something like the following:
    use Encode; my $val = $header->get($_); $val = Encode::decode('MIME-Header', $val); $val = Encode::encode('utf8', $val);
    (I'm sure you could even go further and combine my two lines into one, using Encode::from_to or something like that.)

    Joe

    P.S. Here's an obscure tip that you'll probably never need to worry about: the "Remove Trash..." block of your code should technically come after the decoding that I described above, just in case there is a comma in the encoded data which would be significant to your splitting of the From/To/Cc headers.

      Thanks for you reply. It works great. I have only one small problem. When running the proceeding code, I end up with some trash at the beginning of the subject. Such as: &#1514;\xB7\xA2. Any suggestions?
      my $entity = $parser->parse_data($message) or die $!; my $header = $entity->head() or die $!; my $utf8 = decode('MIME-Header', $header); $header = encode('MIME-Header', $utf8);
        Hmmm... I'm not sure. This code works for me, with the header value from your original post.
        use Encode; my $header = '=?UTF8?B?5LuO5Y2a5a6i5paH56ug5Lit5p+l5om+5oKo5oSf5YW06La +j55qE5Li7?==?UTF-8?B?6aKY?='; my $utf8 = decode('MIME-Header', $header); print "uft8: $utf8\n";
        output is:
        uft8: 从博客文章中查找您感兴趣的主题

        Of course, I can't read any Chinese, so I have no idea if those are the right characters or just gibberish.

Re: Mime::Parser utf-8 issue
by runrig (Abbot) on Jan 22, 2013 at 22:51 UTC
    All you need is:
    my $entity = $parser->parse_data($message); my $header = $entity->head(); my $subject = Encode::decode('MIME-Header', $header->get('subject'));
    I'm not sure why it took nearly 300 lines of code to ask the question...

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: perlquestion [id://770782]
Approved by ww
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others wandering the Monastery: (2)
As of 2024-04-20 04:24 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found