#!/usr/bin/perl
use strict;
use DBI;
use vars qw(%ODBC_type $db_connect_string $db_connect_user $db_connect
+_pass);
use vars qw($anal_distinct_values_limit $anal_dribble_rows $anal_thres
+hold_percentile);
#-----------------------------------------------------------
# Site-specific tailoring:
#
# Configure the following for your relational database
#
my $host = $ENV{ORACLE_HOST} || `hostname`;
chomp $host;
$db_connect_string = "dbi:Oracle:host=$host;sid=$ENV{ORACLE_SID}";
$db_connect_user = $ENV{PMS_LOGIN};
$db_connect_pass = $ENV{PMS_PASSWD};
%ODBC_type = (
3 => 'Numeric',
12 => 'Character',
);
#
# The values below affect the output of the analysis.
$anal_distinct_values_limit = 100; # Display everything for fewer that
+ this number of values
$anal_dribble_rows = 5; # Show isolated values if this accou
+nts for < threshold
$anal_threshold_percentile = 1; # Display everything for freqs > th
+is percentile
# For a column which has fewer than $anal_distinct_values_limit values
+, all values and
# counts will be shown. If there are more than this number of values,
+all values that have
# a count of more than $anal_threshold_percentile as a percent of the
+number of rows, are
# output with their counts.
#
# Also, the tail of the distribution - all values with fewer than $ana
+l_dribble_rows counts
# are output - unless there are too many of them.
#
# This script also detects and reports unique values
$| = 1;
print "Connecting to database...\n";
my $dbh = DBI->connect($db_connect_string,$db_connect_user,$db_connect
+_pass,
{RaiseError=>1,AutoCommit=>1});
if (!@ARGV) {
print <<END;
Usage: perl dbanal.pl schema table [table...]
Script creates files of the form table.anal in the current working dir
+ectory
END
exit 1;
}
my $schema = shift @ARGV;
foreach my $table (@ARGV) {
my $outf = $table.'.anal';
open OUT,">$outf" or die "Failed to create $outf: $!\n";
print "\nTable $table ";
# No rows query to get the column details
my $sth = $dbh->prepare("select * from $schema.$table where 0 = 1"
+);
$sth->execute;
my @columns = @{$sth->{NAME}};
my @types = @{$sth->{TYPE}};
# Now get the row count
my $rows = $dbh->selectall_arrayref("select count(*) from $schema.
+$table")->[0][0];
# Iterate around columns
COL:
foreach my $col (@columns) {
print ".";
# Get column's data type
my $type = shift @types;
my $typename = $ODBC_type{$type} || "Unknown data type $type";
print OUT "\n$col($typename)\n";
# Now get the distribution for the column
my $dist = $dbh->selectall_arrayref
("select $col,count(*) from $schema.$table group by $col")
+;
# Make it ranked:
# Sort by count(*) descending, then value ascending.
my @dist_ranked = sort {$b->[1] <=> $a->[1] ||
(($type == 3) ? ($a->[0] <=> $b->[0]) : ($a->[0] cmp $b->[
+0]))} @$dist;
# Put quotes round values if it's not a numeric
@dist_ranked = map {$_->[0] = "'$_->[0]'";$_} @dist_ranked
if $type != 3;
if (@dist_ranked < $anal_distinct_values_limit) {
# Easy case, few distinct values.
&output_val($rows,@$_) foreach (@dist_ranked);
} else {
# Detect unique values
if ($dist_ranked[0][1] == 1) {
&output_val($rows, "Unique values");
} else {
# Print top of distribution
my $phrase = "Top";
my $accum_freq = 0;
while ($dist_ranked[0][1] * 100 / $rows
> $anal_threshold_percentile) {
my $val = shift @dist_ranked;
&output_val($rows ,@$val);
$phrase = "Next lowest";
$accum_freq += $val->[1];
}
# and print the point at which it tails off
&output_val($rows, "$phrase freq", $dist_ranked[0]
+[1]);
# Now find if there is a tail worth printing out
while ((($rows - $accum_freq) * 100 / $rows >
$anal_threshold_percentile) ||
($dist_ranked[0][1] > $anal_dribble_rows) ||
(@dist_ranked > $anal_distinct_values_limit)) {
my $val = shift @dist_ranked;
$accum_freq += $val->[1];
next COL if !@dist_ranked;
redo if $dist_ranked[0][1] == $val->[1];
}
print OUT " "x20,"Tail values\n" if @dist_ranked;
foreach (@dist_ranked) {
&output_val($rows ,@$_);
}
}
}
}
close OUT;
}
print "\n";
$dbh->disconnect;
# output_val - do the formatting here ( called from several places)
sub output_val {
my ($total,$value,$freq) = @_;
printf OUT " %-30s %10d(%3.1f\%)\n",$value,$freq,100*$fr
+eq/$total;
}
Edited 2002-05-20 by Ovid |