Beefy Boxes and Bandwidth Generously Provided by pair Networks
Clear questions and runnable code
get the best and fastest answer
 
PerlMonks  

ParseCSV

by spacepony (Initiate)
on Dec 15, 2004 at 14:36 UTC ( #415031=sourcecode: print w/replies, xml ) Need Help??
Category: Text Processing
Author/Contact Info Josh Gitlin, joshua dot gitlin at gmail
Description: This is a proposed module that I'd like to submit to CPAN as Spreadsheet::ParseCSV. It works in a similar fashion to SpreadSheet::ParseExcel and is unlike Text::CSV since it takes a file-oriented approach. You instantiate a parser object with a file passed to the constructor and then read it row by row rather than parsing a line at a time which allows for rows that contain line breaks.
=head1 NAME
 
ParseCSV - file-oriented CSV parser that reads one row at a time

=head1 SYNOPSIS

 # This sample script will transpose the first and last column
 # of one CSV file and write it to another

 use Spreadsheet::ParseCSV;
 use Spreadsheet::WriteCSV;
 my $parser = ParseCSV->new("infile.csv");    # initialize the parser
 my $writer = WriteCSV->new("outfile.csv");    # initialize the writer

 while (my $row = $parser->parse_row()) {        # read the next row
   my $firstcol = $row->[0];
   $row->[0] = $row->[$#{$row}];            # swap first and last
   $row->[$#{$row}] = $firstcol;
   $writer->write_row($row);                # write the next row
 }
 $parser->close();                    # close the parser
 $writer->close();                    # close the writer

=head1 DESCRIPTION

 Object-oriented CSV (comma-separated) file parser and writer.

=head2 Definition of a CSV file

 1) Data columns are separated by commas.
 2) Rows are separated by line breaks (see rule 5 also).
 3) Data columns that contain commas are wrapped in double quotes.
 4) Data columns that contain double quotes are wrapped in double quotes
 and the quotes within the data are repeated
 (i.e., "The ""Big"" Bomb" = The "Big" Bomb).
 5) Data columns that contain line breaks are wrapped in double quotes.

=head2 ParseCSV METHOD SUMMARY

=over

=item new()

Object constructor that takes the path of the CSV file to parse, as
shown in the SYNOPSIS. Returns a reference to a new parser object on
success, or 0 if the file is unavailable or unreadable.

=item parse_row()

Reads the next row of data from the file and returns an array reference
holding that row's column values. Returns a false value if there is no
next row.

=item close()

Closes the input file.

=back

=head1 AUTHOR

Joshua Gitlin joshua.gitlin@gmail.com

=cut

# Revision history:
# Date        Author        Remarks
# 21-Sep-2004    Joshua Gitlin    Initial version

package ParseCSV;

# Strictures catch undeclared variables and suspicious constructs in the
# subs below at compile time.
use strict;
use warnings;

# Module version, kept as the package variable $ParseCSV::VERSION.
our $VERSION = "1.00";

# public constructor
# Opens a CSV file for reading and wraps the file handle in a new
# parser object.
#
# Arguments (after the class name), either of:
#   - a single file path:              ParseCSV->new("infile.csv")
#   - a key/value list with "file":    ParseCSV->new(file => "infile.csv")
#
# Returns the new parser object on success, or 0 if no path was given
# or the file could not be opened for reading.
sub new {
    my ($class, @args) = @_;

    my $filename;
    if (@args == 1) {
        $filename = $args[0];
    } elsif (@args && @args % 2 == 0) {
        my %opts = @args;
        $filename = $opts{file};
    }
    return 0 unless defined $filename && length $filename;

    # Three-argument open keeps mode/pipe characters in the file name from
    # being interpreted, and the low-precedence "or" applies to open()
    # itself, so a failed open really does return 0. The handle is lexical
    # so that every parser object owns its own handle.
    open(my $csv, '<', $filename) or return 0;

    return bless { _csv => $csv }, $class;
}

# public method
# Reads the next row of the CSV file and returns its column values as an
# array reference. A row spans several physical lines when a quoted cell
# contains line breaks; those breaks are kept in the cell as "\n".
#
# Returns nothing (a false value) when the end of the file is reached,
# including when the file ends inside an unterminated quoted cell, and
# when a malformed quoted cell is found (a warning is issued in that case).
sub parse_row {
    my $self = shift;
    my $csv  = $self->{_csv};

    my @cols = ();
    my $col  = "";
    my $in_quoted_string = 0;

    while (1) {
        my $data = <$csv>;

        # End of file. Test definedness rather than truth: a last line
        # consisting of "0" with no trailing newline is data, not EOF.
        return unless defined $data;

        # Strip the line terminator, DOS or Unix style.
        $data =~ s/\r?\n\z//;

        # Reverse the string so it can be consumed one char at a time
        # with chop().
        $data = reverse $data;

        # At the start of a row: if the line holds no quotes at all, it
        # can simply be split on commas.
        return \@cols if (!$in_quoted_string && shortcut(\$data, \@cols));

        while (length($data) > 0) {
            my $next_char = chop $data;

            if ($next_char eq '"' && !$in_quoted_string) {
                # beginning of a quoted cell
                $in_quoted_string = 1;
            } elsif ($next_char eq '"') {
                # A quote inside a quoted cell: the following char decides
                # what it means. chop() yields '' once the line is used up.
                my $next_next = chop $data;

                if ($next_next eq '"') {
                    # back-to-back quotes stand for one literal quote
                    $col .= '"';
                } elsif ($next_next eq '') {
                    # closing quote was the last char of the line, so
                    # both the cell and the row are complete
                    push @cols, $col;
                    return \@cols;
                } elsif ($next_next eq ',') {
                    # closing quote followed by a separator
                    $in_quoted_string = 0;
                    push @cols, $col;
                    $col = "";
                    if ($data eq '') {
                        # the separator ended the line: one empty
                        # trailing cell remains
                        push @cols, "";
                        return \@cols;
                    }
                    return \@cols if (shortcut(\$data, \@cols));
                } else {
                    # This shouldn't occur in a well-formed CSV. Report on
                    # STDERR so the caller's own output is not polluted.
                    warn "quote string error [$next_next]\n";
                    return;
                }
            } elsif ($next_char eq ',' && !$in_quoted_string) {
                # end of a non-quoted cell
                push @cols, $col;
                $col = "";
                if ($data eq '') {
                    # the separator ended the line: one empty trailing
                    # cell remains
                    push @cols, "";
                    return \@cols;
                }
                return \@cols if (shortcut(\$data, \@cols));
            } else {
                # just another character
                $col .= $next_char;
            }
        }

        if (!$in_quoted_string) {
            # end of line, end of row
            push @cols, $col;
            return \@cols;
        }

        # End of line reached while still waiting for a closing quote:
        # the line break belongs to the cell, so continue with the next
        # physical line.
        $col .= "\n";
    }
}

# public method
# Optional call that closes the CSV file handle held by the parser.
# Returns the result of the built-in close: true on success, false on
# failure.
sub close {
    my ($self) = @_;

    # Name the built-in explicitly, since this sub shares its name.
    return CORE::close($self->{_csv});
}

# private method
# Speeds up parsing when no more special columns (see rules 3-5) exist in
# the rest of a line.
#
# Arguments:
#   $dataref - reference to the remaining line text, stored reversed
#              (the caller consumes it with chop)
#   $colsref - reference to the array of columns collected so far
#
# If the remaining text contains no double quote, it is split on commas,
# the pieces are appended to @$colsref, and 1 is returned. Otherwise
# nothing is changed and 0 is returned. The text itself is never modified.
sub shortcut {
    my ($dataref, $colsref) = @_;

    # A plain substring search keeps no state on the caller's string; a
    # //g match in boolean context would leave pos() set and could make a
    # repeated call on the same string miss the quote.
    return 0 if index($$dataref, '"') >= 0;

    # Limit -1 keeps trailing empty columns.
    push @{$colsref}, split(/,/, scalar reverse($$dataref), -1);
    return 1;
}

1;
Replies are listed 'Best First'.
Re: ParseCSV
by jZed (Prior) on Dec 15, 2004 at 16:46 UTC
Re: ParseCSV
by dragonchild (Archbishop) on Dec 15, 2004 at 14:48 UTC
    How does your code differ from Text::xSV? Would it be possible to write your module based on Text::xSV, possibly subclassing it?

    Being right, does not endow the right to be rude; politeness costs nothing.
    Being unknowing, is not the same as being stupid.
    Expressing a contrary opinion, whether to the individual or the group, is more often a sign of deeper thought than of cantankerous belligerence.
    Do not mistake your goals as the only goals; your opinion as the only opinion; your confidence as correctness. Saying you know better is not the same as explaining you know better.

      Good call. I don't think this module was in CPAN when I started mine. It looks like the author of Text::xSV found the same problem with Text::CSV and took a pretty similar approach. I think that if all you ever want to do in your life is parse CSV files (I could extend it to any character I suppose), then mine is a lot simpler. xSV looks pretty sophisticated and if it beats mine for performance (seems likely) then I will give up :(

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://415031]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others scrutinizing the Monastery: (4)
As of 2020-10-21 02:18 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    My favourite web site is:












    Results (212 votes). Check out past polls.

    Notices?