Beefy Boxes and Bandwidth Generously Provided by pair Networks
Keep It Simple, Stupid
 
PerlMonks  

Re^2: Useful heuristics for analyzing arrays of data to determine column header

by nysus (Parson)
on Feb 15, 2019 at 11:52 UTC ( [id://1229948]=note: print w/replies, xml ) Need Help??


in reply to Re: Useful heuristics for analyzing arrays of data to determine column header
in thread Useful heuristics for analyzing arrays of data to determine column header

Thanks. I will try this out.

And here is some simple code I have so far. It just loops through the column and collects characteristics of the column into attributes that might be useful. For example, one thing it does is determine if the row is sorted. If it is, and the header is not in the proper sort order, it sets a "has_sort_mismatch" flag. One thing the code accounts for is that the header might be multiple rows anywhere in the first 5 rows of the spreadsheet. That makes things slightly trickier.

package Data::Column ; use Moose; use Scalar::Util qw(looks_like_number); use Log::Log4perl::Shortcuts qw (:all); use namespace::autoclean; has 'first_cell' => (is => 'ro', isa => 'Defined', lazy => +1, default => 0, writer => '_set_first_cell' ); has 'blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_blank_count' ); has 'mixed_case_count' => (is => 'ro', isa => 'Int', lazy => + 1, default => 0, writer => '_set_mixed_case_count' ); has 'upper_case_count' => (is => 'ro', isa => 'Int', lazy => + 1, default => 0, writer => '_set_upper_case_count' ); has 'lower_case_count' => (is => 'ro', isa => 'Int', lazy => + 1, default => 0, writer => '_set_lower_case_count' ); has 'preceding_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_preceding_blank_count' ); has 'following_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_following_blank_count' ); has 'unique_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_unique_count' ); has 'strings_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_strings_count' ); has 'min_length' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_min_length' ); has 'max_length' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_max_length' ); has 'numbers_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_numbers_count' ); has 'percent_blank' => (is => 'ro', isa => 'Num', lazy => 1, d +efault => 0, writer => '_set_percent_blank' ); has 'percent_strings' => (is => 'ro', isa => 'Num', lazy => 1, d +efault => 0, writer => '_set_percent_strings' ); has 'percent_numbers' => (is => 'ro', isa => 'Num', lazy => 1, d +efault => 0, writer => '_set_percent_numbers' ); has 'non_blank_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_non_blank_count' ); has 'spaces_count' => (is => 'ro', isa => 'Int', lazy => 1, d +efault => 0, writer => '_set_spaces_count' ); has 'is_all_strings' => (is => 'ro', isa => 'Bool', lazy => 1, +default => 0, writer => '_set_is_all_strings' ); has 'has_sort_mismatch' => (is => 'ro', isa => 'Bool', lazy => 1, +default => 0, writer => '_set_has_sort_mismatch' ); has 'is_all_numbers' => (is => 'ro', isa => 'Bool', lazy => 1, +default => 0, writer => '_set_is_all_numbers' ); has 'is_pure' => (is => 'ro', isa => 'Bool', lazy => 1, +default => 0, writer => '_set_is_pure' ); has 'is_sorted' => (is => 'ro', isa => 'Bool', lazy => 1, +default => 0, writer => '_set_is_sorted' ); has 'is_mixed' => (is => 'ro', isa => 'Bool', lazy => 1, +default => 0, writer => '_set_is_mixed' ); has 'first_five_rows' => (is => 'ro', isa => 'ArrayRef[Int]', la +zy => 1, default => 0, writer => '_set_first_five_rows' ); has 'data' => (traits => ['Array'], is => 'ro', requi +red => 1, isa => 'ArrayRef', default => sub { [] + }, handles => { count => 'count', is_empt +y => 'is_empty', elements => 'elements', get_data => 'get' }, ); sub BUILD { my $s = shift; my ($blank_count, $string_count, $number_count, $strings_with_spaces +) = (0) x 4; my $last_cell; my $count = 0; my %uniques = (); my $sort_order = ''; my $current_sort_order; my $is_sorted = 1; my $min_length = 0; my $max_length = 0; my $first_non_blank_data_cell = 0; my @first_five_rows = (); my $preceding_blank_row_count = 0; my $following_blank_row_count = 0; my $lower_case_count = 0; my $upper_case_count = 0; my $mixed_case_count = 0; foreach my $cell ( $s->elements ) { $count++; my $is_string = 0; if (!$cell) { $blank_count++; push (@first_five_rows, 0) if $count < 6; next; } if (!$first_non_blank_data_cell && $count > 5) { $first_non_blank_data_cell = $count; } push (@first_five_rows, 1) if $count < 6; if (length $cell < $min_length || !$min_length) { $min_length = length $cell; } if (length $cell > $max_length || !$max_length) { $max_length = length $cell; } $uniques{$cell} = 1; if ( !looks_like_number($cell) ) { $string_count++; $is_string = 1; my $has_lower = $cell =~ /[a-z]/; my $has_upper = $cell =~ /[A-Z]/; $mixed_case_count++ if ($has_lower && $has_upper); $lower_case_count++ if $has_lower && !$has_upper; $upper_case_count++ if $has_upper && !$has_lower; } elsif ($cell) { $number_count++; } if ($cell =~ / /) { $strings_with_spaces++; } if ($is_sorted && $count > 4 && $last_cell && ( $cell ne $last_cel +l ) ) { if ($is_string) { if ($cell gt $last_cell) { $current_sort_order = 'asc'; } else { $current_sort_order = 'desc'; } } else { if ($cell > $last_cell) { $current_sort_order = 'asc'; } else { $current_sort_order = 'desc'; } } $sort_order = $current_sort_order if !$sort_order; if ( $sort_order && $sort_order ne $current_sort_order) { $is_sorted = 0; } } elsif ($count > 4) { $last_cell = $cell; } } my $non_blank_count = $s->count - $blank_count; if ($non_blank_count == $string_count && $string_count) { $s->_set_is_all_strings(1); $s->_set_is_pure(1); } elsif ($non_blank_count == $number_count && $number_count) { $s->_set_is_all_numbers(1); $s->_set_is_pure(1); } else { $s->_set_is_mixed(1); } my $first_non_blank_row = 0; my $row_count = 0; foreach my $row (@first_five_rows) { if ($row) { $first_non_blank_row = $row_count; last; } $row_count++; } foreach my $row (@first_five_rows[$row_count + 1, 4]) { if ($row && $following_blank_row_count) { last; } if (!$row) { $following_blank_row_count++; } } my $sort_mismatch = 0; my $first_cell = $s->get_data($first_non_blank_row); my $first_cell_is_all_upper = 0; my $first_cell_is_all_lower = 0; my $first_cell_is_mixed = 0; if (!looks_like_number($first_cell)) { my $first_cell_has_lower = $first_cell =~ /[a-z]/; my $first_cell_has_upper = $first_cell =~ /[A-Z]/; my $first_cell_is_mixed if ($first_cell_has_lower && $first_cell_h +as_upper); if (!$first_cell_is_mixed) { if ($first_cell =~ /[a-z]/) { $first_cell_is_all_lower = 1; } elsif ($first_cell =~ /[A-Z]/) { $first_cell_is_all_upper = 1; } else { $first_cell_is_mixed = 1; } } } if ($is_sorted) { my $first_cell_is_greater; if (looks_like_number $first_cell) { $first_cell_is_greater = $first_cell > $first_non_blank_data_cel +l; } else { $first_cell_is_greater = $first_cell gt $first_non_blank_data_ce +ll; } logd($first_cell_is_greater); if ($first_cell_is_greater && $sort_order eq 'asc') { logd('hi'); $sort_mismatch = 1; } if (!$first_cell_is_greater && $sort_order eq 'desc') { $sort_mismatch = 1; } } my $non_blank = $s->count - $blank_count; $s->_set_non_blank_count($non_blank); $s->_set_has_sort_mismatch($sort_mismatch); $s->_set_mixed_case_count($mixed_case_count); $s->_set_upper_case_count($upper_case_count); $s->_set_lower_case_count($lower_case_count); $s->_set_percent_blank($blank_count / $s->count * 100); $s->_set_percent_strings($string_count / $non_blank * 100); $s->_set_percent_numbers($number_count / $non_blank * 100); $s->_set_spaces_count($strings_with_spaces); $s->_set_blank_count($blank_count); $s->_set_max_length($max_length); $s->_set_min_length($min_length); $s->_set_strings_count($string_count); $s->_set_numbers_count($number_count); $s->_set_unique_count(scalar keys %uniques); $s->_set_preceding_blank_count($first_non_blank_row); $s->_set_following_blank_count($following_blank_row_count); $s->_set_is_sorted($is_sorted) if $s->count > 6; $s->_set_first_five_rows(\@first_five_rows); $s->_set_first_cell($first_cell); } sub has_blanks { my $s = shift; return $s->blank_count > 0; }

$PM = "Perl Monk's";
$MCF = "Most Clueless Friar Abbot Bishop Pontiff Deacon Curate Priest Vicar";
$nysus = $PM . ' ' . $MCF;
Click here if you love Perl Monks

  • Comment on Re^2: Useful heuristics for analyzing arrays of data to determine column header
  • Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://1229948]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others chilling in the Monastery: (3)
As of 2024-04-26 08:17 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found