Beefy Boxes and Bandwidth Generously Provided by pair Networks
We don't bite newbies here... much
 
PerlMonks  

Fixup tabular text file

by diotalevi (Canon)
on Sep 14, 2006 at 22:37 UTC ( [id://573022]=sourcecode: print w/replies, xml ) Need Help??
Category:
Author/Contact Info
Description:

This does several things to fixup tabular text files to be more regular including:

  • Removes empty trailing rows
  • Adds missing trailing columns
  • Warns if the header is missing columns
  • Warns if the file isn't a text file
  • Warns if there is only one column.
  • Dos2unix line ending conversion
    • Fixes Excel formatted numbers:
    • (...) parens around negative numbers
    • Optional $ sign
    • Optional commas
#!/usr/bin/perl
use strict;
use warnings FATAL => 'all';
no warnings 'uninitialized';
use Getopt::Long 'GetOptions';
use English '$PROGRAM_NAME';
use autouse 'Pod::Usage' => 'pod2usage';
our %CHANGES;

our $VERSION = ~0;

=head1 NAME

fix-tabs - Fixes some common problems in tab-delimited files

=head1 DESCRIPTION

This fixes some problems I encountered with tabular text files.

=over

=item * Removes empty trailing rows

=item * Adds missing trailing columns

=item * Warns if the header is missing columns

=item * Warns if the file isn't a text file

=item * Warns if there is only one column.

=item * Dos2unix line ending conversion

=item * Fixes Excel formatted numbers:

=over

=item * (...) parens around negative numbers

=item * Optional $ sign

=item * Optional commas

=back

=back

=head1 SYNOPSIS

fix-tabs [options] file1 file2 ...

  Options:
    --help Displays this message
    --man  Displays the manual
    --clip Removes things off the right edge of the table

=cut

GetOptions(
    help => sub { pod2usage( -verbose => 1 ) },
    man  => sub { pod2usage( -verbose => 2 ) },
    clip => \our ($CLIP_EDGES),
    )
    or pod2usage( -verbose => 0 );
if ( not scalar @ARGV ) {
    pod2usage( -verbose => 0 );
}

for my $file (@ARGV) {
    fix_file($file);
}
exit;

# To regenerate the regex on the following line, run this
# command. It'll be uglier than what's below but that's because I made
# the one below prettier. It's still equivalent and is the source for
# the below.
#
# perl -MRegexp::Common -le 'print qr/(?:$RE{num}{real}|$RE{num}{int}|
+$RE{num}{real}{-sep=>','}{-group=>3}|$RE{num}{int}{-sep=>','}{-group=
+>3})/'

my $NUMBER;

BEGIN {
    $NUMBER
        = qr/(?x-ism:(?:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])
(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])
(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)(?:[0123456789]+))|
(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]{1,3}
(?:(?:[,])[0123456789]{3})*)(?:(?:[.])(?:[0123456789]{0,}))?)
(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)
(?:[0123456789]{1,3}(?:,[0123456789]{3})*))))/x;
}

my $EXCEL_NUMBER;

BEGIN {
    $EXCEL_NUMBER = qr/(?xsm)
(?:
    \( \$? $NUMBER \)
  |
    \$? $NUMBER
)/;
}

sub fix_file {
    my ($file) = @_;

    -T $file or die "$file isn't a text file.\n";

    # Read the header line and get the # of columns I expect every
    # other line to have.
    open my $fh, '<', $file or die "Can't open $file: $!";
    my $header_line = <$fh>;

    $header_line =~ m/\t/mx
        or die "$file isn't a tab delimited text file.\n";

    my %field_names;

    {
        my @labels = split /\t/, $header_line;
        chomp $labels[-1];

        for my $index ( 0 .. $#labels ) {
            my $column_name = $labels[$index];
            if ( $column_name =~ s/\A\s+// ) {
                ++$CHANGES{header_whitespace};
            }
            if ( $column_name =~ s/\s+\z// ) {
                ++$CHANGES{header_whitespace};
            }

            $field_names{$column_name} = $index;
        }
    }
    my $expected_columns = scalar keys %field_names;

    # The input file will be copied here. This shouldn't be left
    # around after the program is finished.
    my $tmpfile = "$file.tmp";
    open my $out, '>', $tmpfile
        or die "Couldn't open $tmpfile for writing: $!";

    # If there are Windows line endings, that's automatically a fix.
    if ( $header_line =~ tr/\r//d ) {
        ++$CHANGES{windows_cr};
    }

    # Copy out the header line using the cleaned up headigns.
    print {$out} join( "\t",
        sort { $field_names{$a} <=> $field_names{$b} }
            keys %field_names )
        . "\n"
        or die "Couldn't write to $tmpfile: $!";

    while ( my $line = <$fh> ) {

        # Again, fixing Windows line endings.
        if ( $line =~ tr/\r//d ) {
            ++$CHANGES{windows_cr};
        }

        if ( not $line =~ m/\S/msx ) {

            # Just skip empty lines and cause the file to be
            # rewritten.
            ++$CHANGES{blank};
            next;
        }

        my @values = split /\t/, $line;
        chomp $values[-1];

        # Column # fixes. Either too many or too little.
        if ( scalar(@values) < $expected_columns ) {
            ++$CHANGES{col_count};
            push @values, ('') x ( $expected_columns - scalar @values 
+);
        }
        elsif ( scalar(@values) > $expected_columns ) {
            ++$CHANGES{col_count};
            if ( not $CLIP_EDGES ) {
                while ( $values[-1] eq ''
                    and scalar(@values) > $expected_columns )
                {
                    pop @values;
                }
            }
            else {
                splice @values, $expected_columns;
            }

            if ( scalar(@values) > $expected_columns ) {
                warn
                    "Too many columns in row $.. Expected $expected_co
+lumns, got @{[ scalar @values ]}.\n";
            }
        }

        print {$out} join( "\t", @values ) . "\n"
            or die "Couldn't write to $tmpfile: $!";
    }

    close $out
        or die "Couldn't flush $tmpfile: $!";

    if ( not keys %CHANGES ) {
        print "$file ok.\n";
        unlink $tmpfile
            or die "Couldn't remove unused $tmpfile: $!";
    }
    else {
        print "$file fixed.\n";

        # Report on several named things getting fixed. This just puts
        # nice names on the stuff.
        for my $change (
            [ windows_cr => 'Windows line endings' ],
            [ col_count  => 'Column count' ],
            [ blank      => 'Blank lines' ],
            [ fix_num    => 'Number formatting' ]
            )
        {
            my ( $field, $desc ) = @$change;
            my $fix = delete $CHANGES{$field};
            if ( not defined $fix ) {
                next;
            }

            print "$desc: $fix\n";
        }

        for ( sort grep { $CHANGES{$_} } keys %CHANGES ) {
            print "$_: $CHANGES{$_}\n";
        }

        my $backupfile = "$file.old";

        # Add a number to the .old to find a file name that isn't used
        # yet.
        while ( -e $backupfile ) {
            my ($id) = $backupfile =~ m/\.(\d+)$/msx;
            no warnings 'numeric';
            $id += 0;

            $backupfile =~ s/\d+$//msx;
            $backupfile .= ".$id";
        }
        rename $file, $backupfile
            or die "Couldn't rename $file to backupfile";
        rename $tmpfile, $file
            or die "Couldn't rename $tmpfile to $file";
    }

    return 1;
}

sub fix_excel_number {
    my $newnum = $_;
    my $oldnum = $_;

    $newnum =~ s/\A\s*($EXCEL_NUMBER)\s*\z/normalize_excel_number($1)/
+e;

    if ( $newnum ne $oldnum ) {
        $_ = $newnum;
        warn "Fix number $oldnum -> $newnum\n";
        ++$CHANGES{fix_num};
        return 1;
    }
    else {
        return 0;
    }
}

sub normalize_excel_number {
    my $num = shift @_;

    $num =~ tr/$,//d;
    $num =~ s/\A\((.+)\)\z/-$1/;
    return $num;
}
Replies are listed 'Best First'.
Re: Fixup tabular text file
by Scott7477 (Chaplain) on May 31, 2007 at 20:59 UTC

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://573022]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others exploiting the Monastery: (3)
As of 2024-03-28 17:17 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found