Yea, our geocoding process involves human review.
Properties that fail to geocode are reviewed and most often the
address is dirty and needs human cleaning.
Some geocoding engines assign what I view as
a geo-coding confidence level. Properties that
fall below a certain threshold are reviewed also.
The candidate lists come from sql statements generated by perl code. Just for kicks, here's an example of the sql (that doesn't use a lat/long filter):
insert into match_candidates (row1_id,row2_id,strat_name,status)
select t1.row_id,
t2.row_id,
'name_address_city_zip_county_units_weak_improved_trim_v5',
case
when t1.shim_id = t2.shim_id then 'good'
else 'bad'
end
from merge_list t1, merge_list t2
where t1.row_id < t2.row_id
-- looking for matches from the AHI-Doug source to some
-- other source
and (
(t1.source = 'AHI-Doug' or t2.source = 'AHI-Doug') and (t1.sourc
+e != t2.source)
)
-- no duplicate matches from known phase things
-- this can be weakened to show no previous
-- matches from _any_ stratagy
and ( not exists (
select *
from match_candidates m
where t1.row_id = m.row1_id
and t2.row_id = m.row2_id
)
)
-- strategy layers start here (assume at least one)
AND
-- similar name (weak)
t1.name is not null
and t2.name is not null
and ( regexp_replace(lower(t1.name),'[^a-z ]+','','g') ~ regexp_repla
+ce(lower(t2.name),'[^a-z ]+','','g')
or
regexp_replace(lower(t2.name),'[^a-z ]+','','g') ~ regexp_repla
+ce(lower(t1.name),'[^a-z ]+','','g')
)
AND
-- similar address (weak)
t1.address is not null
and t2.address is not null
and ( regexp_replace(lower(t1.address),'[^a-z ]+','','g') ~ regexp_re
+place(lower(t2.address),'[^a-z ]+','','g')
or
regexp_replace(lower(t2.address),'[^a-z ]+','','g') ~ regexp_re
+place(lower(t1.address),'[^a-z ]+','','g')
)
AND
-- similar city (weak)
t1.city is not null
and t2.city is not null
and ( regexp_replace(lower(t1.city),'[^a-z ]+','','g') ~ regexp_repla
+ce(lower(t2.city),'[^a-z ]+','','g')
or
regexp_replace(lower(t2.city),'[^a-z ]+','','g') ~ regexp_repla
+ce(lower(t1.city),'[^a-z ]+','','g')
)
|