Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

Generic database table analysis

by rinceWind (Monsignor)
on May 20, 2002 at 17:15 UTC ( [id://167879]=sourcecode: print w/replies, xml ) Need Help??
Category: Database
Author/Contact Info rinceWind
Description: I have lost count of the number of times I have been asked "What's in the database" or "Can I see the data", by business analysts and users. If the DB table is small enough I usually give them an Excel spreadsheet with the whole table in.

This script caters for larger tables, which would take forever to produce - and not be what the user wanted anyway. What the code produces is an analysis of the field values occurring in every field - descending frequencies.

#!/usr/bin/perl

use strict;
use DBI;

use vars qw(%ODBC_type $db_connect_string $db_connect_user $db_connect
+_pass);
use vars qw($anal_distinct_values_limit $anal_dribble_rows $anal_thres
+hold_percentile);

#-----------------------------------------------------------
# Site-specific tailoring:
#
# Configure the following for your relational database
#
my $host = $ENV{ORACLE_HOST} || `hostname`;
chomp $host;
$db_connect_string = "dbi:Oracle:host=$host;sid=$ENV{ORACLE_SID}";
$db_connect_user = $ENV{PMS_LOGIN};
$db_connect_pass = $ENV{PMS_PASSWD};

%ODBC_type = (
    3 => 'Numeric',
    12 => 'Character',
);
#
# The values below affect the output of the analysis.
$anal_distinct_values_limit = 100; # Display everything for fewer that
+ this number of values
$anal_dribble_rows = 5;           # Show isolated values if this accou
+nts for < threshold
$anal_threshold_percentile = 1;    # Display everything for freqs > th
+is percentile
# For a column which has fewer than $anal_distinct_values_limit values
+, all values and 
# counts will be shown. If there are more than this number of values, 
+all values that have
# a count of more than $anal_threshold_percentile as a percent of the 
+number of rows, are 
# output with their counts.
#
# Also, the tail of the distribution - all values with fewer than $ana
+l_dribble_rows counts
# are output - unless there are too many of them.
#
# This script also detects and reports unique values

$| = 1;

print "Connecting to database...\n";
my $dbh = DBI->connect($db_connect_string,$db_connect_user,$db_connect
+_pass,
        {RaiseError=>1,AutoCommit=>1});

if (!@ARGV) {
    print <<END;

Usage: perl dbanal.pl schema table [table...]

Script creates files of the form table.anal in the current working dir
+ectory

END

    exit 1;
}

my $schema = shift @ARGV;

foreach my $table (@ARGV) {
    my $outf = $table.'.anal';
    open OUT,">$outf" or die "Failed to create $outf: $!\n";

    print "\nTable $table ";

# No rows query to get the column details

    my $sth = $dbh->prepare("select * from $schema.$table where 0 = 1"
+);
    $sth->execute;

    my @columns = @{$sth->{NAME}};
    my @types = @{$sth->{TYPE}};

# Now get the row count

    my $rows = $dbh->selectall_arrayref("select count(*) from $schema.
+$table")->[0][0];

# Iterate around columns

    COL:
    foreach my $col (@columns) {
        print ".";

# Get column's data type

        my $type = shift @types;
        my $typename = $ODBC_type{$type} || "Unknown data type $type";

        print OUT "\n$col($typename)\n";

# Now get the distribution for the column

        my $dist = $dbh->selectall_arrayref
            ("select $col,count(*) from $schema.$table group by $col")
+;
# Make it ranked:
# Sort by count(*) descending, then value ascending. 

        my @dist_ranked = sort {$b->[1] <=> $a->[1] ||
            (($type == 3) ? ($a->[0] <=> $b->[0]) : ($a->[0] cmp $b->[
+0]))} @$dist;

# Put quotes round values if it's not a numeric

        @dist_ranked = map {$_->[0] = "'$_->[0]'";$_} @dist_ranked
            if $type != 3;

        if (@dist_ranked < $anal_distinct_values_limit) {

# Easy case, few distinct values.
            &output_val($rows,@$_) foreach (@dist_ranked);
        } else {

# Detect unique values
            if ($dist_ranked[0][1] == 1) {
                &output_val($rows, "Unique values");
            } else {

# Print top of distribution
                my $phrase = "Top";
                my $accum_freq = 0;
                while ($dist_ranked[0][1] * 100 / $rows 
                        > $anal_threshold_percentile) {
                    my $val = shift @dist_ranked;
                    &output_val($rows ,@$val);
                    $phrase = "Next lowest";
                    $accum_freq += $val->[1];
                }

# and print the point at which it tails off
                    &output_val($rows, "$phrase freq", $dist_ranked[0]
+[1]);

# Now find if there is a tail worth printing out

                while ((($rows - $accum_freq) * 100 / $rows >
                    $anal_threshold_percentile) ||
                    ($dist_ranked[0][1] > $anal_dribble_rows) ||
                    (@dist_ranked > $anal_distinct_values_limit)) {

                    my $val = shift @dist_ranked;
                    $accum_freq += $val->[1];
                    next COL if !@dist_ranked;
                    redo if $dist_ranked[0][1] == $val->[1];
                }
                print OUT " "x20,"Tail values\n" if @dist_ranked;
                foreach (@dist_ranked) {
                    &output_val($rows ,@$_);
                }
            }
        }
    }
    close OUT;
}
print "\n";
$dbh->disconnect;

# output_val - do the formatting here ( called from several places)

sub output_val {
    my ($total,$value,$freq) = @_;

    printf OUT "           %-30s %10d(%3.1f\%)\n",$value,$freq,100*$fr
+eq/$total;
}

Edited 2002-05-20 by Ovid

Replies are listed 'Best First'.
Re: Generic database table analysis
by gav^ (Curate) on May 21, 2002 at 01:38 UTC
    Perhaps shortening analysis to 'anal' isn't such a good idea when you end up with $anal_dribble_rows?

    :)

    gav^

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://167879]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others learning in the Monastery: (3)
As of 2024-04-25 06:42 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found