#!/usr/bin/perl
use strict;
use warnings FATAL => 'all';
no warnings 'uninitialized';
use Getopt::Long 'GetOptions';
use English '$PROGRAM_NAME';
use autouse 'Pod::Usage' => 'pod2usage';
our %CHANGES;
our $VERSION = ~0;
=head1 NAME
fix-tabs - Fixes some common problems in tab-delimited files
=head1 DESCRIPTION
This fixes some problems I encountered with tabular text files.
=over
=item * Removes empty trailing rows
=item * Adds missing trailing columns
=item * Warns if the header is missing columns
=item * Warns if the file isn't a text file
=item * Warns if there is only one column.
=item * Dos2unix line ending conversion
=item * Fixes Excel formatted numbers:
=over
=item * (...) parens around negative numbers
=item * Optional $ sign
=item * Optional commas
=back
=back
=head1 SYNOPSIS
fix-tabs [options] file1 file2 ...
Options:
--help Displays this message
--man Displays the manual
--clip Removes things off the right edge of the table
=cut
GetOptions(
help => sub { pod2usage( -verbose => 1 ) },
man => sub { pod2usage( -verbose => 2 ) },
clip => \our ($CLIP_EDGES),
)
or pod2usage( -verbose => 0 );
if ( not scalar @ARGV ) {
pod2usage( -verbose => 0 );
}
for my $file (@ARGV) {
fix_file($file);
}
exit;
# To regenerate the regex on the following line, run this
# command. It'll be uglier than what's below but that's because I made
# the one below prettier. It's still equivalent and is the source for
# the below.
#
# perl -MRegexp::Common -le 'print qr/(?:$RE{num}{real}|$RE{num}{int}|
+$RE{num}{real}{-sep=>','}{-group=>3}|$RE{num}{int}{-sep=>','}{-group=
+>3})/'
my $NUMBER;
BEGIN {
$NUMBER
= qr/(?x-ism:(?:(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])
(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])
(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)(?:[0123456789]+))|
(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]{1,3}
(?:(?:[,])[0123456789]{3})*)(?:(?:[.])(?:[0123456789]{0,}))?)
(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?:[+-]?)
(?:[0123456789]{1,3}(?:,[0123456789]{3})*))))/x;
}
my $EXCEL_NUMBER;
BEGIN {
$EXCEL_NUMBER = qr/(?xsm)
(?:
\( \$? $NUMBER \)
|
\$? $NUMBER
)/;
}
sub fix_file {
my ($file) = @_;
-T $file or die "$file isn't a text file.\n";
# Read the header line and get the # of columns I expect every
# other line to have.
open my $fh, '<', $file or die "Can't open $file: $!";
my $header_line = <$fh>;
$header_line =~ m/\t/mx
or die "$file isn't a tab delimited text file.\n";
my %field_names;
{
my @labels = split /\t/, $header_line;
chomp $labels[-1];
for my $index ( 0 .. $#labels ) {
my $column_name = $labels[$index];
if ( $column_name =~ s/\A\s+// ) {
++$CHANGES{header_whitespace};
}
if ( $column_name =~ s/\s+\z// ) {
++$CHANGES{header_whitespace};
}
$field_names{$column_name} = $index;
}
}
my $expected_columns = scalar keys %field_names;
# The input file will be copied here. This shouldn't be left
# around after the program is finished.
my $tmpfile = "$file.tmp";
open my $out, '>', $tmpfile
or die "Couldn't open $tmpfile for writing: $!";
# If there are Windows line endings, that's automatically a fix.
if ( $header_line =~ tr/\r//d ) {
++$CHANGES{windows_cr};
}
# Copy out the header line using the cleaned up headigns.
print {$out} join( "\t",
sort { $field_names{$a} <=> $field_names{$b} }
keys %field_names )
. "\n"
or die "Couldn't write to $tmpfile: $!";
while ( my $line = <$fh> ) {
# Again, fixing Windows line endings.
if ( $line =~ tr/\r//d ) {
++$CHANGES{windows_cr};
}
if ( not $line =~ m/\S/msx ) {
# Just skip empty lines and cause the file to be
# rewritten.
++$CHANGES{blank};
next;
}
my @values = split /\t/, $line;
chomp $values[-1];
# Column # fixes. Either too many or too little.
if ( scalar(@values) < $expected_columns ) {
++$CHANGES{col_count};
push @values, ('') x ( $expected_columns - scalar @values
+);
}
elsif ( scalar(@values) > $expected_columns ) {
++$CHANGES{col_count};
if ( not $CLIP_EDGES ) {
while ( $values[-1] eq ''
and scalar(@values) > $expected_columns )
{
pop @values;
}
}
else {
splice @values, $expected_columns;
}
if ( scalar(@values) > $expected_columns ) {
warn
"Too many columns in row $.. Expected $expected_co
+lumns, got @{[ scalar @values ]}.\n";
}
}
print {$out} join( "\t", @values ) . "\n"
or die "Couldn't write to $tmpfile: $!";
}
close $out
or die "Couldn't flush $tmpfile: $!";
if ( not keys %CHANGES ) {
print "$file ok.\n";
unlink $tmpfile
or die "Couldn't remove unused $tmpfile: $!";
}
else {
print "$file fixed.\n";
# Report on several named things getting fixed. This just puts
# nice names on the stuff.
for my $change (
[ windows_cr => 'Windows line endings' ],
[ col_count => 'Column count' ],
[ blank => 'Blank lines' ],
[ fix_num => 'Number formatting' ]
)
{
my ( $field, $desc ) = @$change;
my $fix = delete $CHANGES{$field};
if ( not defined $fix ) {
next;
}
print "$desc: $fix\n";
}
for ( sort grep { $CHANGES{$_} } keys %CHANGES ) {
print "$_: $CHANGES{$_}\n";
}
my $backupfile = "$file.old";
# Add a number to the .old to find a file name that isn't used
# yet.
while ( -e $backupfile ) {
my ($id) = $backupfile =~ m/\.(\d+)$/msx;
no warnings 'numeric';
$id += 0;
$backupfile =~ s/\d+$//msx;
$backupfile .= ".$id";
}
rename $file, $backupfile
or die "Couldn't rename $file to backupfile";
rename $tmpfile, $file
or die "Couldn't rename $tmpfile to $file";
}
return 1;
}
sub fix_excel_number {
my $newnum = $_;
my $oldnum = $_;
$newnum =~ s/\A\s*($EXCEL_NUMBER)\s*\z/normalize_excel_number($1)/
+e;
if ( $newnum ne $oldnum ) {
$_ = $newnum;
warn "Fix number $oldnum -> $newnum\n";
++$CHANGES{fix_num};
return 1;
}
else {
return 0;
}
}
sub normalize_excel_number {
my $num = shift @_;
$num =~ tr/$,//d;
$num =~ s/\A\((.+)\)\z/-$1/;
return $num;
}
|