Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW
 
PerlMonks  

parsembox

by vxp (Pilgrim)
on Jul 28, 2002 at 21:51 UTC ( [id://185851]=sourcecode: print w/replies, xml ) Need Help??
Category: Text Processing
Author/Contact Info vpolyakov@katrillion.com
Description: This is a small script that parses an mbox file and extracts email addresses from it (I use it at work to parse a bounce file and collect the bounced addresses for various purposes). Feel free to modify it, use it, whatever. (Credit info: this was actually not written by me, but by the previous network admin.)
#!/usr/bin/perl -w
# parsembox - scan an mbox file of bounce messages and collect the
# addresses the bounces refer to.
#
# Usage: parsembox MBOX_FILE
#
# Extracted addresses are appended to "addr.list"; messages in which no
# address was found are appended to "nomatch.mbox".
use strict;

# Path of the mbox file to scan (first command-line argument).
my $file = shift;
defined($file)
        or die "Usage: $0 mbox-file\n";

my $msg = undef;        # text of the message currently being read
my $count = 0;          # number of messages seen
my $count_match = 0;    # number of addresses written to addr.list
my $blank = 1;          # true if the previous line was blank (or at start of file)
my $matched = 0;        # true once an address was logged for the current message
my $addr_flag = undef;  # when defined, says how to read the address from the next line

# Three-argument open: the file name can never be mistaken for part of
# the open mode.  The messages report $! (the OS error), not "!$".
open(MBOX, '<', $file)
        or die "Couldn't open mbox: $!\n";
open(ADDR_LOG, '>>', 'addr.list')
        or die "Couldn't open list: $!\n";
open(NOMATCH, '>>', 'nomatch.mbox')
        or die "Couldn't open nomatch: $!\n";

# print_addr(ADDR)
#
# Record one extracted address: append ADDR to the ADDR_LOG handle, clear
# the pending $addr_flag, mark the current message as matched and
# increment $count_match.
#
# If ADDR is undefined or empty (the caller's pattern captured nothing),
# nothing is logged or counted; only $addr_flag is cleared, so the message
# is still treated as unmatched.
sub print_addr
{
        my $addr = shift;

        if (!defined($addr) || $addr eq '')
        {
                $addr_flag = undef;
                return;
        }

        # NOTE(review): presumably a debugging aid -- an address of exactly
        # '1' is what a pattern match yields in scalar context.  The
        # output is unchanged; an undefined flag prints as an empty line
        # without an "uninitialized" warning.
        if ($addr eq '1')
        {
                print((defined($addr_flag) ? $addr_flag : '') . "\n");
        }

        print ADDR_LOG "$addr\n";
        $addr_flag = undef;
        $matched = 1;
        $count_match++;
        return;
}

# How to read the address from the line that follows a marker, keyed by
# the $addr_flag value the marker set.
my %follow_up_re = (
        "std"     => qr/<(.*)>/,
        "to"      => qr/\sTo:\s*(.*)/,
        "postfix" => qr/\s(.*) \(user not found\)/,
        # Lazy capture, so that the optional trailing colon really is
        # left out of the address.
        "space"   => qr/^\s*(.*?):?\s*$/,
);

# A "wrap-*" state consumes one non-blank line and then becomes the base
# state.  Nothing in this script sets these states; they are kept as-is.
my %unwrapped_flag = (
        "wrap-std"   => "std",
        "wrap-to"    => "to",
        "wrap-space" => "space",
);

# Main scan.  The mbox is read line by line and split into messages at
# "From " lines that follow a blank line.  Within a message, each
# non-blank line is either checked against the known bounce markers (no
# flag pending) or used as the address line announced by the previous
# marker (flag pending).  Scanning of a message stops at its first match.
while (<MBOX>)
{
        if ($blank && /\AFrom .*\d{4}/)
        {
                # New message: flush the previous one if nothing was
                # found in it, then start collecting afresh.
                $count++;
                print NOMATCH $msg if (!$matched && defined($msg));
                $msg = $_;
                $blank = 0;
                $matched = 0;
                # A marker at the very end of the previous message must
                # not be applied to this message's header lines.
                $addr_flag = undef;
                next;
        }

        $msg .= $_;
        # Empty line, with either LF or CRLF line ending.
        $blank = /\A\r?\n?\z/ ? 1 : 0;
        next if ($blank || $matched);

        if (!defined($addr_flag))
        {
                if (/^\s-+ The following addresses had permanent fatal errors -+$/)
                {
                        $addr_flag = "std";
                } elsif (/not accepting mail with attachments or embedded images:?$/) {
                        my ($addr) = /Your mail to (.*) could not/;
                        print_addr("$addr\@aol.com") if (defined($addr));
                } elsif (/permanent error; I've given up\. Sorry it didn't work out\.$/) {
                        $addr_flag = "std";
                } elsif (/undeliverable to the following:$/) {
                        $addr_flag = "postfix";
                } elsif (/Final-Recipient:/) {
                        my ($addr) = /822;(.*)/;
                        if (defined($addr))
                        {
                                # Use the part inside <...> when present;
                                # $_ (the current line) is left untouched.
                                $addr = $1 if ($addr =~ /<(.*)>/);
                                $addr =~ s/^\s+//;
                                $addr =~ s/\s+$//;
                                print_addr($addr) if ($addr ne '');
                        }
                } elsif (/Receiver not found:/) {
                        my ($addr) = /Receiver not found:(.*)/;
                        $addr =~ s/^\s+//;
                        $addr =~ s/\s+$//;
                        print_addr("$addr\@compuserve.com") if ($addr ne '');
                } elsif (/delete existing messages and then empty their trash/) {
                        $addr_flag = "std";
                } elsif (/^was not delivered to:$/) {
                        $addr_flag = "space";
                } elsif (/^Your message$/) {
                        $addr_flag = "to";
                } elsif (/^recipients\. The following address\(es\) failed:$/) {
                        $addr_flag = "space";
                } elsif (/^Delivery to the following recipients failed\.$/) {
                        $addr_flag = "space";
                } elsif (/Here is your List of Failed Recipients/) {
                        $addr_flag = "std";
                } elsif (/The user\(s\) account is temporarily over quota/) {
                        $addr_flag = "std";
                } elsif (/-+Transcript of session follows -+/) {
                        $addr_flag = "space";
                } elsif (/Reason: Not in authentication system/) {
                        my ($addr) = /to '(.*)'/;
                        print_addr($addr) if (defined($addr));
                } elsif (/Reason: User .* is not found in the cc:Mail Directory/) {
                        my ($addr) = /User "(.*)"/;
                        print_addr($addr) if (defined($addr));
                } elsif (/^User unknown: /) {
                        my ($addr) = /^User unknown: (.*)/;
                        print_addr($addr);
                } elsif (/User mailbox exceeds allowed size/) {
                        my ($addr) = /allowed size: (.*)/;
                        print_addr($addr) if (defined($addr));
                }
        } else {
                if (exists($unwrapped_flag{$addr_flag}))
                {
                        $addr_flag = $unwrapped_flag{$addr_flag};
                        next;
                }

                my $addr;
                if (exists($follow_up_re{$addr_flag}))
                {
                        ($addr) = ($_ =~ $follow_up_re{$addr_flag});
                }

                if (defined($addr) && $addr ne '')
                {
                        print_addr($addr);
                } else {
                        # The announced address line held no address:
                        # drop the flag and go back to looking for markers.
                        $addr_flag = undef;
                }
        }
}

# The loop flushes a message only when the next "From " line arrives, so
# the last message of the file has to be flushed here.
print NOMATCH $msg if (!$matched && defined($msg));

# Summary: messages seen, addresses logged, and the difference.
print "Total: $count\n";
print "Match: $count_match\n";
print "Miss : " . ($count - $count_match) . "\n";

# Buffered write errors (a full disk, for instance) only show up when the
# handle is closed, so the two append handles are checked.  MBOX is only
# read from.
close(ADDR_LOG)
        or die "Couldn't close list: $!\n";
close(MBOX);
close(NOMATCH)
        or die "Couldn't close nomatch: $!\n";

Edit by dws to add <code> tags

Replies are listed 'Best First'.
Re: parsembox
by ehdonhon (Curate) on Jul 29, 2002 at 00:14 UTC
Re: parsembox
by vxp (Pilgrim) on Jul 28, 2002 at 21:55 UTC
    I'm sorry, it appears as if there is absolutely no indentation or any formatting whatsoever. However, when I click on "edit your code" it is formatted there, and _with_ indentation... :-)
      You must put <code> </code> tags around your code.

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://185851]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others having an uproarious good time at the Monastery: (4)
As of 2024-04-19 21:21 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found