#!perl -w use strict; use Data::Dumper; # delimiter style , pick one my $delim; # this mode produces comma delimted files, special magic needs to be added to output $delim=','; # this produces tab delimted files, fields just cant contain a tab # the output needs to have a .txt filetype and be opened with excel # or another filetype (say .tdf) if that filetype is assigned to be opened by excel # i like tdf better, but you said csv # $delim="\t"; sub delimfix{ # excel has problems with text files # something excell may think as a date but should be text # 2012-12-12 # 2/4 # something excell may think is a number but is text # 1e123 # leading zeros as text # 00001 # leading equals as text # =x # (leading equals-dquote, training dquote) fixes these problems (the equals function) # There still exists a long string problem with ="..." # |001-----0|010-----0|020-----0|030-----0|040-----0|050-----0|060-----0|070-----0|080-----0|090-----0|101-----0|110-----0|120-----0|130-----0|140-----0|150-----0|160-----0|170-----0|180-----0|190-----0|201-----0|210-----0|220-----0|230-----0|240-----0|250-----0|260-----0|270-----0|280-----0|290-----0 my $field=shift; if ($field=~m!"!) { $field=~s!"!""!g; } if ($delim eq ',' && $field=~m!,!) { # csv also has problems with commas in string # but if it has a comma it doesnt have the other problems return '"'.$field.'"'; } return '="'.$field.'"'; }; sub infoprint { # Output a row of information my($title,$record) = @_; # print $title." data\n"; print Dumper $record; my @delimline; if (lc($title) eq 'top') { print delimfix($record->{title})."\n"; print "\n"; for my $f (split("\t",$record->{getheaders})) { push @delimline,delimfix($f)} } else { if ($record->{permitdate}=~m!^\d+/\d+/\d+!) { push @delimline,$record->{permitdate}; } else { push @delimline,delimfix($record->{permitdate}); } push @delimline,delimfix($record->{address}); push @delimline,delimfix($record->{description}); } print join($delim,@delimline)."\n"; }; # used in getheaders to add tabs my $linejoin=' '; # This will collect all information for one entry: my %info; # what is in %info my $infotype='Top'; my %topinfo; # state machine data my $record_kind='none'; my $last_record_blank=1; my %next_record = ( # state table was => now none => 'page', page => 'topaddr', topaddr => 'rundate', rundate => 'title', title => 'getheaders', getheaders => 'getheaders', # hold state till manual reset founddesc => 'permitdate', permitdate => 'address', address => 'description', description => 'permitdate', ); while() { chomp; s!\t! !g; # can have tabs in fields , kill them if ( m!^\s*$!) { # blanks dont matter $last_record_blank=1; next; } elsif ($last_record_blank) { # new data so go to next state $record_kind = $next_record{ $record_kind }; if ($record_kind eq 'permitdate') { # move to permitdate means dump full data infoprint($infotype,\%info); %info=(); $infotype='Permit'; } $last_record_blank=0; } if (defined($info{ $record_kind })) { $info{ $record_kind } .= $linejoin . $_; } else { $info{ $record_kind } = $_; } if ($record_kind eq 'getheaders') { if ( m!^Description! ) { $record_kind='founddesc'; # manual move to next state $linejoin=' '; %topinfo=%info; } else {$linejoin="\t"; } } }; infoprint('Permit',\%info,); print "------\n"; print delimfix($topinfo{page})."\n"; print delimfix($topinfo{topaddr})."\n"; print delimfix($topinfo{rundate})."\n"; __DATA__ Page 1 of 3 100 Civic Center Way Calabasas, California 91302 7/12/2012 9:21:02AM "hi" MONTHLY EXTERNAL MODIFICATIONS PERMITS REPORT Jun 2012 Permit Issued Address Description 06/01/2012 26166 ROYMOR DR Upgrade panel from 100 amp to 200 amp 06/04/2012 24956 NORMANS WAY (6) light fixtures @ patio; (3) branch circuits; (4) electric heaters 06/05/2012 4273 VICASA DR Construct 339 SF Covered Loggia