=head1 NAME

ParseCSV - simple object-oriented CSV file parser

=head1 SYNOPSIS

    # This sample script will transpose the first and last column
    # of one CSV file and write it to another

    use Spreadsheet::ParseCSV;
    use Spreadsheet::WriteCSV;

    my $parser = ParseCSV->new("infile.csv");   # initialize the parser
    my $writer = WriteCSV->new("outfile.csv");  # initialize the writer

    while (my $row = $parser->parse_row()) {    # read the next row
        my $firstcol = $row->[0];
        $row->[0] = $row->[$#{$row}];           # swap first and last
        $row->[$#{$row}] = $firstcol;
        $writer->write_row($row);               # write the next row
    }

    $parser->close();                           # close the parser
    $writer->close();                           # close the writer

=head1 DESCRIPTION

Object-oriented CSV (comma-separated) file parser and writer.

=head2 Definition of a CSV file

=over

=item 1)

Data columns are separated by commas.

=item 2)

Rows are separated by line breaks (see rule 5 also).

=item 3)

Data columns that contain commas are wrapped in double quotes.

=item 4)

Data columns that contain double quotes are wrapped by double quotes
and the quotes within the data are repeated
(i.e., C<"The ""Big"" Bomb"> == C<The "Big" Bomb>).

=item 5)

Data columns that contain line breaks are wrapped in double quotes.

=back

=head2 ParseCSV METHOD SUMMARY

=over

=item new()

Object constructor.  Takes either a single argument, the input file
path, or an argument hash with one key - "file" - that specifies the
input file path.  Returns a reference to a new parser object on
success, or returns 0 if the file is unavailable or unreadable.

=item parse_row()

Reads the next row of data from the file and returns an array
reference with the columns of data.  Returns a false value (undef in
scalar context) if there is no next row or if the row is malformed.

=item close()

Closes the input file.  Returns a boolean indicating whether the close
succeeded.

=back

=head1 AUTHOR

Joshua Gitlin joshua.gitlin@gmail.com

=cut

# Revision history:
# Date         Author          Remarks
# 21-Sep-2004  Joshua Gitlin   Initial version

package ParseCSV;

use strict;
use warnings;

our $VERSION = "1.00";

# public constructor
# Takes either the path of the CSV file to parse, or an argument
# hash with one key "file" holding that path.  Opens the file for
# reading and returns the new parser object, or 0 if the file
# could not be opened.
sub new {
    my ($class, @args) = @_;

    my $filename;
    if (@args == 1) {
        # positional form: ParseCSV->new($path)
        $filename = $args[0];
    }
    elsif (@args && @args % 2 == 0) {
        # named form: ParseCSV->new(file => $path)
        my %opt = @args;
        $filename = $opt{file};
    }
    return 0 unless defined $filename && length $filename;

    # Lexical handle so that several parsers can be open at once;
    # three-argument open so the path is never interpreted as a mode.
    # Low-precedence "or" makes the failure check apply to open() itself.
    open my $csv, '<', $filename
        or return 0;

    return bless { _csv => $csv }, $class;
}

# public method
# Gets the next row of the CSV file and returns the column
# values as an array reference.  Returns nothing (false) at end
# of file, or when a malformed quoted cell is encountered.
sub parse_row {
    my $self = shift;
    my $csv  = $self->{_csv};

    my @cols = ();
    my $col  = "";
    my $in_quoted_string = 0;

    # A row may span several physical lines when a quoted cell
    # contains line breaks, so keep reading until the row is complete.
    LINE:
    while (defined(my $data = <$csv>)) {
        $data =~ s/\r\n$/\n/;    # dos2unix
        chomp $data;

        # this will account for empty line breaks in one column
        if ($in_quoted_string && $data eq '') {
            $col .= "\n";
            next LINE;
        }

        # Reverse the string so we can chop it one char at a time
        $data = reverse $data;

        return \@cols if (!$in_quoted_string && shortcut(\$data, \@cols));

        while (length($data) > 0) {
            my $next_char = chop $data;

            if ($next_char eq '"' && !$in_quoted_string) {
                # beginning of a quoted cell
                $in_quoted_string = 1;
            }
            elsif ($next_char eq '"') {
                # Another double-quote inside a quoted cell: look at
                # what follows to determine what it means.
                if (length($data) == 0) {
                    # closing quote at end of line: the row is complete
                    push @cols, $col;
                    return \@cols;
                }

                my $next_next = chop $data;
                if ($next_next eq '"') {
                    # back-to-back quotes means the char is a quote
                    $col .= '"';
                }
                elsif ($next_next eq ',') {
                    # closing quote followed by a comma: the cell is closed
                    $in_quoted_string = 0;
                    push @cols, $col;
                    $col = "";
                    return \@cols if (_take_plain_rest(\$data, \@cols));
                }
                else {
                    # This shouldn't occur in a well-formed CSV
                    warn "quote string error [$next_next]\n";
                    return;
                }
            }
            elsif ($next_char eq ',' && !$in_quoted_string) {
                # end of a non-quoted cell
                push @cols, $col;
                $col = "";
                return \@cols if (_take_plain_rest(\$data, \@cols));
            }
            else {
                # just another character
                $col .= $next_char;
            }
        }

        if (!$in_quoted_string) {
            # end of line, end of row
            push @cols, $col;
            return \@cols;
        }

        # We have reached the end of line but are still waiting
        # for a close quote.  Must continue to the next line.
        $col .= "\n";
    }

    # end of file (any unterminated quoted cell is discarded)
    return;
}

# public method
# Optional call to close the CSV file.  Returns a boolean
# indicating success or failure of close.
sub close {
    my $self = shift;

    # CORE:: makes it explicit that the builtin is wanted,
    # not this method.
    return CORE::close($self->{_csv});
}

# private method
# Called right after a column separator has been consumed.  An
# empty remainder means the row ends with one empty column;
# otherwise defer to shortcut().  Returns 1 if the row is complete.
sub _take_plain_rest {
    my ($dataref, $colsref) = @_;

    if ($$dataref eq '') {
        push @{$colsref}, '';
        return 1;
    }
    return shortcut($dataref, $colsref);
}

# private method
# Subroutine that speeds up parsing if
# no more special columns (see rules 3-5) exist.
# Takes a reference to the remaining (reversed) line data and a
# reference to the column array.  Returns 1 after appending the
# remaining columns, or 0 if a quote is still present.
sub shortcut {
    my ($dataref, $colsref) = @_;

    # if there are no more quoted strings, we can
    # short circuit the parsing and just split() the rest
    return 0 if index($$dataref, '"') >= 0;

    # the limit of -1 keeps trailing empty columns
    push @{$colsref}, split(/,/, scalar reverse($$dataref), -1);
    return 1;
}

1;