Here's some code I used to parse addresses. Maybe it will help.
package AddressParser;
use DBI qw(:sql_types);
use DBUtil;
use Utils;
use Log::Log4perl qw(get_logger);
use strict;
use warnings;
# Pre-compiled patterns used to recognise and split address lines.
#
# The capture-group numbers are significant: parse() reads the groups
# positionally, so groups must not be added, removed or reordered.
# Each pattern is kept on a single line; splitting a pattern across lines
# without /x would make the line break part of the pattern.

# US city line: city text ($1), two-letter state ($3), ZIP ($4) made of
# five digits ($5) and an optional four-digit extension ($6), optionally
# followed by two further digits ($7).
my $us_zip_re = qr/^\s*(([\w.]+\W)+)\s*([a-zA-Z]{2})\s*((\d{5})[ -]?(\d{4})?)(\d{2})?\s*$/;

# Canadian city line: city text ($1), two-letter province ($3), postal
# code in letter-digit-letter digit-letter-digit form ($4).
my $can_post_code_re = qr/^\s*((\w+\W)+)\s*([a-zA-Z]{2})\s+([a-zA-Z]\d[a-zA-Z][ -]?\d[a-zA-Z]\d)\s*$/;

# Canadian city line that spells out the country: city text ($1), a run of
# letters ($3), the literal word CANADA (any case), postal code ($4).
my $foreign_canada_re = qr/^\s*((\w+\W)+)\s*([a-zA-Z]+)\s+CANADA\s+([a-zA-Z]\d[a-zA-Z][ -]?\d[a-zA-Z]\d)\s*$/i;

# Romanian city line: city text ($1), the literal word ROMANIA (any case),
# postal code of four or more digits with an optional four-digit tail ($3).
my $foreign_romania_re = qr/^\s*((\w+\W)+)\s*ROMANIA\s+(\d{4,}[ -]?(\d{4})?)\s*$/i;

# French city line: city text ($1), the literal word FRANCE (any case),
# postal code of four or more digits with an optional four-digit tail ($3).
my $foreign_france_re = qr/^\s*((\w+\W)+)\s*FRANCE\s+(\d{4,}[ -]?(\d{4})?)\s*$/i;

# Street line: optional leading "letters/# + digits" token ($1), the street
# number with an optional single-letter prefix and/or suffix ($2), and the
# rest of the line ($3).
my $street_re = qr/^\s*([a-zA-Z#]+\s*\d+\s+)?([a-zA-Z]?\d+[a-zA-Z]?)\s+(\S.*)$/;

# PO box line: everything up to and including the word "box" in any case
# ($1), followed by the box number ($3). Not anchored at the end.
my $po_box_re = qr/^\s*((\w+\W+)*\s*[bB][oO][xX])\s*(\d+)/;
#---------------------------------------------------------------------
# Constructor
#
# Returns a hash-based object blessed into the invoking class. Its only
# slot, _state_table, starts out undefined. Any further arguments are
# accepted as key/value pairs but are not used.
#---------------------------------------------------------------------
sub new {
    my ($class, %arg) = @_;

    # The call that would populate the state table is commented out:
    # $self->_init_state_table;
    return bless { _state_table => undef }, $class;
}
# Fill $self->{_state_table} from the database: every row of the state
# query contributes one entry keyed by the first column (abbrev) whose value
# is the second column (name). The connection is obtained from
# DBUtil::fetch_connection('shared').
sub _init_state_table {
    my ($self) = @_;

    my $sql = <<'QUERY_END';
select
abbrev,
name
from
state
QUERY_END

    my $connection = DBUtil::fetch_connection('shared');
    my $cursor     = $connection->prepare($sql);
    $cursor->execute;

    # fetchrow_array yields an empty list once the rows are exhausted,
    # which ends the loop.
    while (my ($abbrev, $full_name) = $cursor->fetchrow_array) {
        $self->{_state_table}{$abbrev} = $full_name;
    }

    $cursor->finish;
}
# Substitute the zip code into the lines of a "beta" style address.
#
# Parameters:
#   $beta_address - array reference whose first element is the zip code
#                   (may be undefined) and whose remaining elements are the
#                   address lines.
#
# Returns a reference to a new array holding the address lines only, where
# the first occurrence of the placeholder ZIPCD in each defined line has
# been replaced by the whitespace-trimmed zip code. The caller's array is
# not modified.
sub insert_zip {
    my ($self, $beta_address) = @_;
    my ($zipcd, @lines) = @$beta_address;

    # A missing zip code is substituted as the empty string.
    $zipcd = '' unless defined($zipcd);

    # Trim surrounding whitespace from the zip code.
    for ($zipcd) {
        s/^\s+//;
        s/\s+$//;
    }

    # The loop variable aliases each element, so the substitution edits
    # @lines in place; undefined entries are left untouched.
    foreach my $entry (@lines) {
        next unless defined($entry);
        $entry =~ s/ZIPCD/$zipcd/;
    }

    return \@lines;
}
# Locate the street line and the city/zip line within a list of address
# lines.
#
# Parameters:
#   $lines_ref - array reference of address lines. Undefined entries are
#                skipped; an undefined reference is treated as an empty
#                list.
#
# Returns a two-element list: (street line, city/zip line). If either one
# cannot be found, the failure is logged together with a dump of all lines
# and (undef, undef) is returned.
sub find_address_lines {
    my ($self, $lines_ref) = @_;
    my @lines = defined($lines_ref) ? @$lines_ref : ();
    my $street_line_index = -1;
    my $zip_line_index    = -1;

    # Scan from the last line upwards: a city/zip line has to be seen before
    # any line above it is considered as the street line.
    foreach my $index (reverse(0 .. $#lines)) {
        my $line = $lines[$index];
        next unless defined($line);

        # Find the street line, but only if we've already found the zip line
        if (($zip_line_index > -1) &&
            (($line =~ /$street_re/) ||
             ($line =~ /$po_box_re/))) {
            $street_line_index = $index;
            last;
        }

        if (($line =~ /$us_zip_re/) ||
            ($line =~ /$can_post_code_re/) ||
            ($line =~ /$foreign_canada_re/) ||
            ($line =~ /$foreign_romania_re/) ||
            ($line =~ /$foreign_france_re/)) {
            $zip_line_index = $index;
        }
    }

    if ($zip_line_index == -1) {
        get_logger()->error("Couldn't find zip code");
    }
    if ($street_line_index == -1) {
        get_logger()->error("Couldn't find street");
    }

    if (($zip_line_index == -1) || ($street_line_index == -1)) {
        # Dump every line (undefined ones as empty strings) to help diagnose
        # why the address was not recognised.
        get_logger()->error(
            "\n\t" . join("\n\t", (map { defined($_) ? $_ : '' } @lines)));
        return (undef, undef);
    }

    # Plain element access rather than one-element array slices.
    return ($lines[$street_line_index], $lines[$zip_line_index]);
}
# Build an Address object from a street line and a city line.
#
# Parameters:
#   $street_line - line matching either the street pattern or the PO box
#                  pattern
#   $city_line   - line matching one of the city/zip patterns
#
# Returns the populated Address object, or undef (after logging an error)
# when either line matches none of the known patterns.
sub parse {
    my ($self, $street_line, $city_line) = @_;
    my $address = Address->new();

    # A match in list context returns the capture groups, so element [n-1]
    # of each list below corresponds to $n of the pattern.
    if (my @street_groups = ($street_line =~ /$street_re/)) {
        $address->set('street_number', $street_groups[1]);
        $address->set('street_name',   $street_groups[2]);
    }
    elsif (my @box_groups = ($street_line =~ /$po_box_re/)) {
        # For a PO box the box number serves as the street number and the
        # text up to and including "box" as the street name.
        $address->set('street_number', $box_groups[2]);
        $address->set('street_name',   $box_groups[0]);
    }
    else {
        get_logger()->error("Couldn't parse street: $street_line");
        return undef;
    }

    # City-line formats, tried in order. Columns: pattern, country code,
    # capture group holding the state (undef when no state is stored), and
    # capture group holding the zip/postal code. The city is always group 1.
    my @city_formats = (
        [ $us_zip_re,          'US', 3,     4 ],
        [ $can_post_code_re,   'CA', 3,     4 ],
        [ $foreign_canada_re,  'CA', undef, 4 ],
        [ $foreign_romania_re, 'RO', undef, 3 ],
        [ $foreign_france_re,  'FR', undef, 3 ],
    );

    foreach my $format (@city_formats) {
        my ($pattern, $country, $state_group, $zip_group) = @$format;

        my @groups = ($city_line =~ /$pattern/);
        next unless @groups;

        $address->set('city', $groups[0]);
        if (defined($state_group)) {
            $address->set('state', $groups[$state_group - 1]);
        }
        $address->set('zip',     $groups[$zip_group - 1]);
        $address->set('country', $country);
        return $address;
    }

    get_logger()->error("Couldn't parse city: $city_line");
    return undef;
}
# Parse a list of address lines into an address object.
#
# Parameters:
#   $address_lines - array reference of address lines
#   $address_type  - optional; when it is the string 'BETA' the lines are
#                    first passed through insert_zip(), which treats the
#                    first element as the zip code and substitutes it into
#                    the remaining lines
#
# Returns whatever parse() returns for the street and city lines found by
# find_address_lines(), or undef when either of those lines is missing.
sub parse_address {
    my ($self, $address_lines, $address_type) = @_;

    if (defined($address_type) && ($address_type eq 'BETA')) {
        $address_lines = $self->insert_zip($address_lines);
    }

    my ($street_line, $city_line) = $self->find_address_lines($address_lines);

    # An explicit undef is returned (rather than a bare return) so that
    # existing callers see the same value in list context as before.
    if (!defined($street_line) || !defined($city_line)) {
        return undef;
    }

    return $self->parse($street_line, $city_line);
}
1;
-
Are you posting in the right place? Check out "Where do I post X?" to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in <code> tags, not <pre> tags.
In fact, <pre> tags should generally be avoided. If they must be used,
take extreme care to ensure that their contents do not have long lines
(keep every line under 70 characters), in order to prevent horizontal
scrolling (and possible janitor intervention).
-
Want more info? "How to link" or "How to display code and escape characters"
are good places to start.