RFC: Text::CSV::R

lima1 has asked for the wisdom of the Perl Monks concerning the following question:

Hi,

~~here~~ (Update: Text::CSV::R) you'll find a first implementation of a simple wrapper around Text::CSV that behaves more or less like R's read.table functions. These work really well with most real world CSV files. So you can slurp a not too mean CSV file in just one line of code. Here the read.table documentation.

What's maybe controversial with my implementation is the use of a tied array as object. I know lots of people consider this evil. Basically, the data of the CSV file is stored in an 2D array with the column and rownames attached. The rational behind this is, most of the time you just want the data. Now you don't have to do something like $obj->get_data. Second, it's very close to the R syntax.

But I am very open to criticism!

SYNOPSIS

         #use Text::CSV::R qw(:all);
         use Text::CSV::R qw(read_table colnames rownames);

         my $M = read_table($filename, \%options);

         print join(q{,}, colnames($M));
         print join(q{,}, rownames($M));

         print $M->[0][0];

         for my $row (@{$M}) {
           for my $col (@{$row}) {
               # do someting with $col
           }
         }
[download]

Update: As requested, the main code, without any POD.

package Text::CSV::R;

require 5.005;

use strict;
use warnings;

require Exporter;
use Text::CSV;
use Text::CSV::R::Matrix;
use Carp;
use Scalar::Util qw(reftype);

our @ISA = qw(Exporter);

our %EXPORT_TAGS = (
    'all' => [
        qw(
            read_csv read_csv2 read_table read_delim rownames colnames
            )
    ]
);

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our $VERSION = '0.01';

our $DEFAULT_OPTS = {
    header           => undef,
    skip             => 0,
    nrow             => -1,
    sep_char         => q{,},
    quote_char       => q{"},
    allow_whitespace => 0,
    binary           => 1,
    blank_lines_skip => 1,
};

# A mapping of the R options to the Text:CSV options. If there is no
# Text::CSV equivalent, the same option name is used (R options are
# not passed to Text::CSV).
our $R_OPT_MAP = {
    sep              => 'sep_char',
    quote            => 'quote_char',
    skip             => 'skip',
    nrow             => 'nrow',
    header           => 'header',
    encoding         => 'encoding',
    row_names        => 'row_names',
    strip_white      => 'allow_whitespace',
    blank_lines_skip => 'blank_lines_skip',
};

sub colnames {
    my ( $tied_ref, $values ) = @_;
    my $tied_obj = tied @{$tied_ref};
    if ( defined $values ) { $tied_obj->{colnames} = $values; }
    return $tied_obj->{colnames};
}

sub rownames {
    my ( $tied_ref, $values ) = @_;
    my $tied_obj = tied @{$tied_ref};
    if ( defined $values && reftype $values eq 'ARRAY' ) { 
        if (scalar @{$values} != scalar @{$tied_obj->{ARRAY}}) {
            croak 'Invalid rownames length';
        }
        $tied_obj->{rownames} = $values; 
    }
    return $tied_obj->{rownames};
}

# merge the global default options, function defaults and user options
sub _merge_options {
    my ( $t_opt, $u_opt ) = @_;
    my %ret = %{$DEFAULT_OPTS};
    @ret{ keys %{$t_opt} } = values %{$t_opt};
    @ret{ keys %{$u_opt} } = values %{$u_opt};
    for my $k ( keys %{$R_OPT_MAP} ) {
        if ( defined $ret{$k} ) {
            $ret{ $R_OPT_MAP->{$k} } = $ret{$k};
        }
    }
    return \%ret;
}

sub read_table {
    my ( $file, %u_opt ) = @_;
    return _read( $file, _merge_options( {}, \%u_opt ) );
}

sub read_csv {
    my ( $file, %u_opt ) = @_;
    my $t_opt = { sep_char => q{,}, header => 1, };
    return _read( $file, _merge_options( $t_opt, \%u_opt ) );
}

sub read_csv2 {
    my ( $file, %u_opt ) = @_;
    my $t_opt = { sep_char => q{;}, header => 1, };
    return _read( $file, _merge_options( $t_opt, \%u_opt ) );
}

sub read_delim {
    my ( $file, %u_opt ) = @_;
    my $t_opt = { sep_char => "\t", header => 1, };
    return _read( $file, _merge_options( $t_opt, %u_opt ) );
}

# check if $file is a filehandle, if not open file with correct encodi
+ng.
# Then let _parse_fh do the work.
sub _read {
    my ( $file, $opts ) = @_;
    
    my $data_ref;
  
    if (reftype \$file eq 'SCALAR') {
        my $encoding = q{};
        if (defined $opts->{encoding} && length $opts->{encoding} > 0)
+ {
            $encoding = ':encoding(' . $opts->{encoding} . ')';
        }  
        open my $IN, '<' . $encoding, $file or croak "Cannot open $fil
+e for reading: $!";
        $data_ref = _parse_fh( $IN, $opts );
        close $IN or croak "Cannot close $file: $!";
    } else {
        $data_ref = _parse_fh( $file, $opts );
    }

    return $data_ref;
}

# parsing of the file in a 2d array, store column and row names.
sub _parse_fh {
    my ( $IN, $opts ) = @_;
    my @data;
    my $obj = tie @data, 'Text::CSV::R::Matrix';
    my %text_csv_opts = %{$opts};
    delete @text_csv_opts{ keys %{$R_OPT_MAP} };
    my $csv = Text::CSV->new( \%text_csv_opts )
        or croak q{Cannot use CSV: } . Text::CSV->error_diag();

    # skip lines
    my $line_number = 0;
    while ( $line_number < $opts->{skip} && <$IN> ) {
        $line_number++;
    }
    $line_number = 0;
    my $max_cols = 0;
LINE:
    while ( my $line = <$IN> ) {
        chomp $line;

        # blank_lines_skip option
        if (   !length($line)  && defined $opts->{'blank_lines_skip'}
            && $opts->{'blank_lines_skip'} )
        {
            next LINE;
        }

        my $status = $csv->parse($line)
            or croak q{Cannot parse CSV: } . $csv->error_input();
        push @data, [ $csv->fields() ];
        if ( scalar( @{ $data[-1] } ) > $max_cols ) {
            $max_cols = scalar @{ $data[-1] };
        }
        $line_number++;

        # nrow option. Store one more because file might contain heade
+r.
        last LINE
            if ( defined $opts->{nrow}
            && $opts->{nrow} >= 0
            && $line_number > $opts->{nrow} );
    }
    
    my $auto_col_row = scalar @{$data[0]} == $max_cols - 1 ? 1 : 0;

    # read column names
    if ( $auto_col_row || ( defined $opts->{header} && $opts->{header}
+ ) ) {
        colnames( \@data, shift @data );
    }
    else {
        colnames( \@data, [ map { 'V' . $_ } 1 .. $max_cols ] );
        if ( defined $opts->{nrow} && scalar(@data) > $opts->{nrow} ) 
+{
            pop @data;
        }
    }

    # read row names
    my @rownames;
    if ( $auto_col_row ) {
        for my $row (@data) {
            push @rownames, $row->[0];
            shift @{$row};
        }
    } elsif ( defined $opts->{row_names} && reftype \$opts->{row_names
+} eq 'SCALAR' ) {
        for my $row (@data) {
            push @rownames, $row->[$opts->{row_names}];
            splice @{$row}, $opts->{row_names}, 1;
        }
    } else {
        @rownames = 1 .. scalar @data;
    }
    rownames(\@data, \@rownames);
    return \@data;
}

1;
[download]

Back to Seekers of Perl Wisdom