Hi,
I'm a relative newbie with perl. I've managed to create a script that reads from a few DB_FILE hashes and inserts its contents into a mysql database.
Problem is that it's very slow. There's probably 650k records in the DB_FILE, and it takes many hours to finish. I can see a few fundamental ways to optimize this, but none that are going to really speed it up. I'm not even sure where the real bottlenecks are, because the processors are basically idle, the box has 8GB of RAM that's mostly unused, and the disks are also mostly idle. The box is otherwise not doing anything.
The script is designed to rsync the hashes from two remote mail hosts which write to the DB_FILES the output from amavisd and spamassassin to build a quarantine database.
Can I separate the script into two, so the DB_FILE databases are written to mysql simultaneously? Will it automatically figure out the potential locking issues doing that?
Can someone recommend or provide some ideas for optimizing the inserts into mysql with my script?
#!/usr/bin/perl -w
# use perl;
use DB_File;
use DBI;
use File::Basename qw(basename);
use strict;
use vars qw($verbose);
#
# Prototypes
#
sub DBG($);
my $me = basename($0); $me =~ s/\.pl$//;
my $basedir = '/var/www/myhost.com/';
my $lockfile = '/var/run/db-fetch-and-merge.lock';
my $key = $basedir . '/keys/mail-sync-key-2012';
my $user = 'amavis';
my $t_timeout = 5 * 60;
my $rc = 0;
my $cnt = 0;
my $steps = 100;
$verbose = shift || 1;
my $db = {
'user' => 'quarantine',
'pass' => 'my%quar',
'host' => 'localhost',
'db' => 'quarantine',
};
#
# Retrieve the quarantine database files from the two mail hosts.
#
foreach my $host (qw(mail01 mail02)) {
print "rsyncing: $host\n";
my $cmd = sprintf("/usr/bin/rsync -avq -e 'ssh -c blowfish -l %
+s -i %s' --delete %s %s",
$user,
$key,
$host . '.myhost.com:/var/spool/amavisd/qdb/*.db',
$basedir . '/' . $host . '/'
);
if (system($cmd) != 0) {
print STDERR "An error occured while syncing $host: $!\n"; $rc =
+ 1; goto done;
}
}
#
# Connect to the MySQL database and empty out the 'entry' table.
#
my $dbh = DBI->connect("DBI:mysql:database=$db->{'db'};host=$db->
+{'host'}",
$db->{'user'}, $db->{'pass'}, { 'PrintError' => 1, });
$dbh->do('DELETE FROM entry');
#
# XXX
#
foreach my $host (qw(mail01 mail02)) {
my $src = $basedir . '/' . $host;
print "processing: $src\n";
for (my $i = 0; $i < 256; $i++) {
my $file = sprintf('%s/%02x.db', $src, $i);
my %hash;
eval {
$SIG{'ALRM'} = sub { die "alarm\n" }; alarm($t_timeout);
unless (tie (%hash, 'DB_File', $file, O_RDONLY, 0600)) {
print STDERR "Can't open db '$file': $!\n"; $rc = 1; next;
}
foreach my $filename (keys %hash) {
next unless $hash{$filename};
my @tmp = split /\t/, $hash{$filename}, 7;
my $type = $tmp[0];
my $dt = $tmp[1];
my $size = $tmp[2];
my $from = $tmp[3];
my $to = $tmp[4];
my $subj = $tmp[5];
my $score = $tmp[6];
if ($type eq "virus") {
$score = 100;
}
my $datestamp = sprintf('%04d-%02d-%02d %02d:%02d:%02d',
($dt =~ m/^([0-9]{4})([0-9]{2})([0-9]{2})T([0-9]{2})([0-9
+]{2})([0-9]{2}).*$/));
if($host eq '') { $host = "NOTDEFINED"; };
if($filename eq '') { $filename = "NOTDEFINED"; };
if($datestamp eq '') { $datestamp = "NOTDEFINED"; };
if($type eq '') { $type = "NOTDEFINED"; };
if($size eq '') { $size = "NOTDEFINED"; };
if($from eq '') { $from = "NOTDEFINED"; };
if($to eq '') { $to = "NOTDEFINED"; };
if($subj eq '') { $subj = "NOTDEFINED"; };
if($score eq '') { $score = "NOTDEFINED"; };
printf("|%.15s|%.20s|%.20s|%.04s|%0.04s|%0.10s|%0.10s|%0.20s|%0.
+05s|\n",$host,$filename,$datestamp,$type,$size,$from,$to,$subj,$score
+);
my $sql = sprintf('INSERT INTO entry VALUES(%s, %s, %s, %
+s, %s, %s, %s, %s, %s)',
$dbh->quote($host),
$dbh->quote($filename),
$dbh->quote($datestamp),
$dbh->quote($type),
$dbh->quote($size),
$dbh->quote($from),
$dbh->quote($to),
$dbh->quote($subj),
$dbh->quote($score)
);
$dbh->do($sql);
if (($cnt % $steps) == 0) { DBG(sprintf("\e[8D%-8d\n", $cnt)
+); } $cnt++;
} # end inside foreach
print "\n";
untie %hash;
alarm(0);
};
if ($@ eq "alarm\n") {
print STDERR "Can't open db '$file' (timeout): $!\n"; $rc = 1;
+ next;
}
} # end for hash for each host
} # end foreach
#
# Disconnect from the database, remove our lock file, and exit quiet
+ly.
#
done:
$dbh->disconnect() if ($dbh);
unlink $lockfile;
exit 0;
######################################################################
+##########
sub DBG($) { my $msg = shift; print $msg; }