http://qs321.pair.com?node_id=1183896


in reply to creating array of hashes from input file

Corion made good points, but it needs a flush(\%info); at the end.

Here is a more non form dependent single state driven method. I like state machines too! It also fixes some common excel problems noted.

#!perl -w use strict; use Data::Dumper; # delimiter style , pick one my $delim; # this mode produces comma delimted files, special magic needs to be a +dded to output $delim=','; # this produces tab delimted files, fields just cant contain a tab # the output needs to have a .txt filetype and be opened with excel # or another filetype (say .tdf) if that filetype is assigned to be op +ened by excel # i like tdf better, but you said csv # $delim="\t"; sub delimfix{ # excel has problems with text files # something excell may think as a date but should be text # 2012-12-12 # 2/4 # something excell may think is a number but is text # 1e123 # leading zeros as text # 00001 # leading equals as text # =x # (leading equals-dquote, training dquote) fixes these problems (the +equals function) # There still exists a long string problem with ="..." # |001-----0|010-----0|020-----0|030-----0|040-----0|050-----0|060- +----0|070-----0|080-----0|090-----0|101-----0|110-----0|120-----0|130 +-----0|140-----0|150-----0|160-----0|170-----0|180-----0|190-----0|20 +1-----0|210-----0|220-----0|230-----0|240-----0|250-----0|260-----0|2 +70-----0|280-----0|290-----0 my $field=shift; if ($field=~m!"!) { $field=~s!"!""!g; } if ($delim eq ',' && $field=~m!,!) { # csv also has problems with commas in string # but if it has a comma it doesnt have the other problems return '"'.$field.'"'; } return '="'.$field.'"'; }; sub infoprint { # Output a row of information my($title,$record) = @_; # print $title." data\n"; print Dumper $record; my @delimline; if (lc($title) eq 'top') { print delimfix($record->{title})."\n"; print "\n"; for my $f (split("\t",$record->{getheaders})) { push @delimline, +delimfix($f)} } else { if ($record->{permitdate}=~m!^\d+/\d+/\d+!) { push @delimline,$record->{permitdate}; } else { push @delimline,delimfix($record->{permitdate}); } push @delimline,delimfix($record->{address}); push @delimline,delimfix($record->{description}); } print join($delim,@delimline)."\n"; }; # used in getheaders to add tabs my $linejoin=' '; # This will collect all information for one entry: my %info; # what is in %info my $infotype='Top'; my %topinfo; # state machine data my $record_kind='none'; my $last_record_blank=1; my %next_record = ( # state table was => now none => 'page', page => 'topaddr', topaddr => 'rundate', rundate => 'title', title => 'getheaders', getheaders => 'getheaders', # hold state till manual reset + founddesc => 'permitdate', permitdate => 'address', address => 'description', description => 'permitdate', ); while(<DATA>) { chomp; s!\t! !g; # can have tabs in fields , kill them if ( m!^\s*$!) { # blanks dont matter $last_record_blank=1; next; } elsif ($last_record_blank) { # new data so go to next state $record_kind = $next_record{ $record_kind }; if ($record_kind eq 'permitdate') { # move to permitdate means + dump full data infoprint($infotype,\%info); %info=(); $infotype='Permit'; } $last_record_blank=0; } if (defined($info{ $record_kind })) { $info{ $record_kind } .= $linejoin . $_; } else { $info{ $record_kind } = $_; } if ($record_kind eq 'getheaders') { if ( m!^Description! ) { $record_kind='founddesc'; # manual move to next state $linejoin=' '; %topinfo=%info; } else {$linejoin="\t"; } } }; infoprint('Permit',\%info,); print "------\n"; print delimfix($topinfo{page})."\n"; print delimfix($topinfo{topaddr})."\n"; print delimfix($topinfo{rundate})."\n"; __DATA__ Page 1 of 3 100 Civic Center Way Calabasas, California 91302 7/12/2012 9:21:02AM "hi" MONTHLY EXTERNAL MODIFICATIONS PERMITS REPORT Jun 2012 Permit Issued Address Description 06/01/2012 26166 ROYMOR DR Upgrade panel from 100 amp to 200 amp 06/04/2012 24956 NORMANS WAY (6) light fixtures @ patio; (3) branch circuits; (4) electric heaters 06/05/2012 4273 VICASA DR Construct 339 SF Covered Loggia

Replies are listed 'Best First'.
Re^2: creating array of hashes from input file
by huck (Prior) on Mar 08, 2017 at 02:33 UTC

    I played with the above so much i factored out the array of hashs to make the output file instead after putting it in, So the following is refactored again to construct the AoH, THEN make the output file from it.

    #!perl -w use strict; use Data::Dumper; # all fo them my $aoh=[]; # used in getheaders to add tabs my $linejoin=' '; # This will collect all information for one entry: my $infohash={type=>'top'}; # state machine data my $record_kind='none'; my $last_record_blank=1; my %next_record = ( # state table was => now none => 'page', page => 'topaddr', topaddr => 'rundate', rundate => 'title', title => 'getheaders', getheaders => 'getheaders', # hold state till manual reset + founddesc => 'permitdate', permitdate => 'address', address => 'description', description => 'permitdate', ); while(<DATA>) { chomp; s!\t! !g; # can have tabs in fields, kill them if ( m!^\s*$!) { # blanks dont matter $last_record_blank=1; next; } elsif ($last_record_blank) { # new data so go to next state $record_kind = $next_record{ $record_kind }; if ($record_kind eq 'permitdate') { # move to permitdate means + dump full data push @$aoh,$infohash; $infohash={type=>'permit'}; } $last_record_blank=0; } if (defined($infohash->{ $record_kind })) { $infohash->{ $record_kind } .= $linejoin . $_; } else { $infohash-> { $record_kind } = $_; } if ($record_kind eq 'getheaders') { if ( m!^Description! ) { $record_kind='founddesc'; # manual move to next state $linejoin=' '; } else {$linejoin="\t"; } } }; push @$aoh,$infohash; print Dumper($aoh); ######################################################### # make csv ######################################################### # delimiter style , pick one my $delim; # this mode produces comma delimted files, special magic needs to be a +dded to output $delim=','; # this produces tab delimted files, fields just cant contain a tab # the output needs to have a .txt filetype and be opened with excel # or another filetype (say .tdf) if that filetype is assigned to be op +ened by excel # i like tdf better, but you said csv # $delim="\t"; for my $hash (@$aoh) { infoprint ($hash->{type},$hash); } infoprint ('bot',@{$aoh}[0]); sub delimfix{ # excel has problems with text files # something excell may think as a date but should be text # 2012-12-12 # 2/4 # something excell may think is a number but is text # 1e123 # leading zeros as text # 00001 # leading equals as text # =x # (leading equals-dquote, training dquote) fixes these problems (the +equals function) # There still exists a long string problem with ="..." # |001-----0|010-----0|020-----0|030-----0|040-----0|050-----0|060- +----0|070-----0|080-----0|090-----0|101-----0|110-----0|120-----0|130 +-----0|140-----0|150-----0|160-----0|170-----0|180-----0|190-----0|20 +1-----0|210-----0|220-----0|230-----0|240-----0|250-----0|260-----0|2 +70-----0|280-----0|290-----0 my $field=shift; if ($field=~m!"!) { $field=~s!"!""!g; } if ($delim eq ',' && $field=~m!,!) { # csv also has problems with commas in string # but if it has a comma it doesnt have the other problems return '"'.$field.'"'; } return '="'.$field.'"'; }; sub infoprint { # Output a row of information my($title,$record) = @_; # print $title." data\n"; print Dumper $record; my @delimline; if (lc($title) eq 'top') { print delimfix($record->{title})."\n"; print "\n"; for my $f (split("\t",$record->{getheaders})) { push @delimline, +delimfix($f)} } elsif (lc($title) eq 'bot') { print "------\n"; print delimfix($record->{page})."\n"; print delimfix($record->{topaddr})."\n"; print delimfix($record->{rundate})."\n"; } else { if ($record->{permitdate}=~m!^\d+/\d+/\d+!) { push @delimline,$record->{permitdate}; } else { push @delimline,delimfix($record->{permitdate}); } push @delimline,delimfix($record->{address}); push @delimline,delimfix($record->{description}); } print join($delim,@delimline)."\n"; }; __DATA__ Page 1 of 3 100 Civic Center Way Calabasas, California 91302 7/12/2012 9:21:02AM "hi" MONTHLY EXTERNAL MODIFICATIONS PERMITS REPORT Jun 2012 Permit Issued Address Description 06/01/2012 26166 ROYMOR DR Upgrade panel from 100 amp to 200 amp 06/04/2012 24956 NORMANS WAY (6) light fixtures @ patio; (3) branch circuits; (4) electric heaters 06/05/2012 4273 VICASA DR Construct 339 SF Covered Loggia