Corion made good points, but it needs a flush(\%info); at the end.
Here is a more non form dependent single state driven method. I like state machines too!
It also fixes some common excel problems noted.
#!perl -w
use strict;
use Data::Dumper;
# delimiter style , pick one
my $delim;
# this mode produces comma delimted files, special magic needs to be a
+dded to output
$delim=',';
# this produces tab delimted files, fields just cant contain a tab
# the output needs to have a .txt filetype and be opened with excel
# or another filetype (say .tdf) if that filetype is assigned to be op
+ened by excel
# i like tdf better, but you said csv
# $delim="\t";
sub delimfix{
# excel has problems with text files
# something excell may think as a date but should be text
# 2012-12-12
# 2/4
# something excell may think is a number but is text
# 1e123
# leading zeros as text
# 00001
# leading equals as text
# =x
# (leading equals-dquote, training dquote) fixes these problems (the
+equals function)
# There still exists a long string problem with ="..."
# |001-----0|010-----0|020-----0|030-----0|040-----0|050-----0|060-
+----0|070-----0|080-----0|090-----0|101-----0|110-----0|120-----0|130
+-----0|140-----0|150-----0|160-----0|170-----0|180-----0|190-----0|20
+1-----0|210-----0|220-----0|230-----0|240-----0|250-----0|260-----0|2
+70-----0|280-----0|290-----0
my $field=shift;
if ($field=~m!"!) {
$field=~s!"!""!g;
}
if ($delim eq ',' && $field=~m!,!) {
# csv also has problems with commas in string
# but if it has a comma it doesnt have the other problems
return '"'.$field.'"';
}
return '="'.$field.'"';
};
sub infoprint {
# Output a row of information
my($title,$record) = @_;
# print $title." data\n"; print Dumper $record;
my @delimline;
if (lc($title) eq 'top') {
print delimfix($record->{title})."\n";
print "\n";
for my $f (split("\t",$record->{getheaders})) { push @delimline,
+delimfix($f)}
}
else {
if ($record->{permitdate}=~m!^\d+/\d+/\d+!) {
push @delimline,$record->{permitdate};
} else {
push @delimline,delimfix($record->{permitdate});
}
push @delimline,delimfix($record->{address});
push @delimline,delimfix($record->{description});
}
print join($delim,@delimline)."\n";
};
# used in getheaders to add tabs
my $linejoin=' ';
# This will collect all information for one entry:
my %info;
# what is in %info
my $infotype='Top';
my %topinfo;
# state machine data
my $record_kind='none';
my $last_record_blank=1;
my %next_record = ( # state table was => now
none => 'page',
page => 'topaddr',
topaddr => 'rundate',
rundate => 'title',
title => 'getheaders',
getheaders => 'getheaders', # hold state till manual reset
+
founddesc => 'permitdate',
permitdate => 'address',
address => 'description',
description => 'permitdate',
);
while(<DATA>) {
chomp;
s!\t! !g; # can have tabs in fields , kill them
if ( m!^\s*$!) { # blanks dont matter
$last_record_blank=1;
next;
}
elsif ($last_record_blank) { # new data so go to next state
$record_kind = $next_record{ $record_kind };
if ($record_kind eq 'permitdate') { # move to permitdate means
+ dump full data
infoprint($infotype,\%info);
%info=();
$infotype='Permit';
}
$last_record_blank=0;
}
if (defined($info{ $record_kind })) {
$info{ $record_kind } .= $linejoin . $_;
}
else {
$info{ $record_kind } = $_;
}
if ($record_kind eq 'getheaders') {
if ( m!^Description! ) {
$record_kind='founddesc'; # manual move to next state
$linejoin=' ';
%topinfo=%info;
}
else {$linejoin="\t"; }
}
};
infoprint('Permit',\%info,);
print "------\n";
print delimfix($topinfo{page})."\n";
print delimfix($topinfo{topaddr})."\n";
print delimfix($topinfo{rundate})."\n";
__DATA__
Page 1 of 3
100 Civic Center Way
Calabasas, California 91302
7/12/2012 9:21:02AM "hi"
MONTHLY EXTERNAL MODIFICATIONS PERMITS REPORT
Jun 2012
Permit Issued
Address
Description
06/01/2012
26166 ROYMOR
DR
Upgrade panel from 100 amp to 200 amp
06/04/2012
24956 NORMANS
WAY
(6) light fixtures @ patio; (3) branch circuits; (4) electric
heaters
06/05/2012
4273 VICASA
DR
Construct 339 SF Covered Loggia
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.