I played with the above so much i factored out the array of hashs to make the output file instead after putting it in, So the following is refactored again to construct the AoH, THEN make the output file from it.
#!perl -w
use strict;
use Data::Dumper;
# all fo them
my $aoh=[];
# used in getheaders to add tabs
my $linejoin=' ';
# This will collect all information for one entry:
my $infohash={type=>'top'};
# state machine data
my $record_kind='none';
my $last_record_blank=1;
my %next_record = ( # state table was => now
none => 'page',
page => 'topaddr',
topaddr => 'rundate',
rundate => 'title',
title => 'getheaders',
getheaders => 'getheaders', # hold state till manual reset
+
founddesc => 'permitdate',
permitdate => 'address',
address => 'description',
description => 'permitdate',
);
while(<DATA>) {
chomp;
s!\t! !g; # can have tabs in fields, kill them
if ( m!^\s*$!) { # blanks dont matter
$last_record_blank=1;
next;
}
elsif ($last_record_blank) { # new data so go to next state
$record_kind = $next_record{ $record_kind };
if ($record_kind eq 'permitdate') { # move to permitdate means
+ dump full data
push @$aoh,$infohash;
$infohash={type=>'permit'};
}
$last_record_blank=0;
}
if (defined($infohash->{ $record_kind })) {
$infohash->{ $record_kind } .= $linejoin . $_;
}
else {
$infohash-> { $record_kind } = $_;
}
if ($record_kind eq 'getheaders') {
if ( m!^Description! ) {
$record_kind='founddesc'; # manual move to next state
$linejoin=' ';
}
else {$linejoin="\t"; }
}
};
push @$aoh,$infohash;
print Dumper($aoh);
#########################################################
# make csv
#########################################################
# delimiter style , pick one
my $delim;
# this mode produces comma delimted files, special magic needs to be a
+dded to output
$delim=',';
# this produces tab delimted files, fields just cant contain a tab
# the output needs to have a .txt filetype and be opened with excel
# or another filetype (say .tdf) if that filetype is assigned to be op
+ened by excel
# i like tdf better, but you said csv
# $delim="\t";
for my $hash (@$aoh) {
infoprint ($hash->{type},$hash);
}
infoprint ('bot',@{$aoh}[0]);
sub delimfix{
# excel has problems with text files
# something excell may think as a date but should be text
# 2012-12-12
# 2/4
# something excell may think is a number but is text
# 1e123
# leading zeros as text
# 00001
# leading equals as text
# =x
# (leading equals-dquote, training dquote) fixes these problems (the
+equals function)
# There still exists a long string problem with ="..."
# |001-----0|010-----0|020-----0|030-----0|040-----0|050-----0|060-
+----0|070-----0|080-----0|090-----0|101-----0|110-----0|120-----0|130
+-----0|140-----0|150-----0|160-----0|170-----0|180-----0|190-----0|20
+1-----0|210-----0|220-----0|230-----0|240-----0|250-----0|260-----0|2
+70-----0|280-----0|290-----0
my $field=shift;
if ($field=~m!"!) {
$field=~s!"!""!g;
}
if ($delim eq ',' && $field=~m!,!) {
# csv also has problems with commas in string
# but if it has a comma it doesnt have the other problems
return '"'.$field.'"';
}
return '="'.$field.'"';
};
sub infoprint {
# Output a row of information
my($title,$record) = @_;
# print $title." data\n"; print Dumper $record;
my @delimline;
if (lc($title) eq 'top') {
print delimfix($record->{title})."\n";
print "\n";
for my $f (split("\t",$record->{getheaders})) { push @delimline,
+delimfix($f)}
}
elsif (lc($title) eq 'bot') {
print "------\n";
print delimfix($record->{page})."\n";
print delimfix($record->{topaddr})."\n";
print delimfix($record->{rundate})."\n";
}
else {
if ($record->{permitdate}=~m!^\d+/\d+/\d+!) {
push @delimline,$record->{permitdate};
} else {
push @delimline,delimfix($record->{permitdate});
}
push @delimline,delimfix($record->{address});
push @delimline,delimfix($record->{description});
}
print join($delim,@delimline)."\n";
};
__DATA__
Page 1 of 3
100 Civic Center Way
Calabasas, California 91302
7/12/2012 9:21:02AM "hi"
MONTHLY EXTERNAL MODIFICATIONS PERMITS REPORT
Jun 2012
Permit Issued
Address
Description
06/01/2012
26166 ROYMOR
DR
Upgrade panel from 100 amp to 200 amp
06/04/2012
24956 NORMANS
WAY
(6) light fixtures @ patio; (3) branch circuits; (4) electric
heaters
06/05/2012
4273 VICASA
DR
Construct 339 SF Covered Loggia
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.