$ tar xf $HOME/Downloads/kyotocabinet-1.2.77.tar.gz
$ cd kyotocabinet-1.2.77
$ patch -p0 < $HOME/Downloads/patch-configure.diff
$ patch -p0 < $HOME/Downloads/patch-kcthread.cc
$ patch -p0 < $HOME/Downloads/patch-kccommon.h.diff
$ ./configure --disable-lzo --disable-lzma
$ make -j2
$ sudo make install
$ cd ..
$ tar xf $HOME/Downloads/kyotocabinet-perl-1.20.tar.gz
$ cd kyotocabinet-perl-1.20
$ perl Makefile.PL
$ make
$ make test
$ sudo make install
$ cd doc
$ open index.html    # view the documentation in a web browser
$ cd ../..
Demonstration
I created approximately 10 and 17 million records for input_a.txt and input_b.txt, respectively, using the following scripts. The input files remain sorted.
use strict;
use warnings;

# input_a.pl: generate input_a.txt, three "KEY VALUE" lines per id
# (two values under the "<id>D" key, one under the "<id>E" key).
# Ids are written in ascending order, so the file comes out sorted by key.
my $out_file = 'input_a.txt';
my $id_count = 333_333 * 10;    # 3 lines per id => 9,999,990 lines

open my $fh_a, '>', $out_file or die "open error: $!";

foreach my $i ( 1 .. $id_count ) {
    # zero-pad the id to 9 digits so that string order matches numeric order
    my $prefix = sprintf('%09d', $i);
    print {$fh_a} "${prefix}D x\n", "${prefix}D y\n", "${prefix}E t\n"
        or die "write error: $!";
}

# buffered write errors (e.g. disk full) may only surface at close
close $fh_a or die "close error: $!";
use strict;
use warnings;

# input_b.pl: generate input_b.txt, five "KEY VALUE" lines per id
# (three values under the "<id>D" key, two under the "<id>E" key).
# Ids are written in ascending order, so the file comes out sorted by key.
my $out_file = 'input_b.txt';
my $id_count = 333_333 * 10;    # 5 lines per id => 16,666,650 lines

open my $fh_b, '>', $out_file or die "open error: $!";

foreach my $i ( 1 .. $id_count ) {
    # zero-pad the id to 9 digits so that string order matches numeric order
    my $prefix = sprintf('%09d', $i);
    print {$fh_b} "${prefix}D m\n", "${prefix}D n\n", "${prefix}D o\n",
                  "${prefix}E m\n", "${prefix}E s\n"
        or die "write error: $!";
}

# buffered write errors (e.g. disk full) may only surface at close
close $fh_b or die "close error: $!";
Afterwards, run stage_a.pl to create the Kyoto Cabinet database. This is quite fast and stores 10 million records in little time.
use strict;
use warnings;
use KyotoCabinet;

# stage_a.pl: load input_a.txt (sorted "KEY VALUE" lines) into the Kyoto
# Cabinet database input_a.kct. Each distinct key becomes one record whose
# value is the space-joined list of every value seen for that key.

# construct the database object
my $db = KyotoCabinet::DB->new();

# open the database for writing, creating it if it does not exist
if ( !$db->open('input_a.kct#pccap=256m', $db->OWRITER | $db->OCREATE) ) {
    die "open error (db): ", $db->error;
}

# open input_file a
open my $fh_a, '<', 'input_a.txt' or die "open error (txt): $!";

# Store one key together with all of its values, joined by single spaces.
# A failed set is reported with a warning but does not abort the load.
sub db_store {
    my ($key, @vals) = @_;
    if ( !$db->set($key, join(' ', @vals)) ) {
        warn "db set error: ", $db->error, "\n";
    }
    return;
}

my $key_a;     # key of the group being accumulated; undef until the first line
my @vals_a;    # values collected so far for $key_a

while ( my $line = <$fh_a> ) {
    chomp $line;
    my ($key, $val) = split(' ', $line, 2);

    # blank or key-only line: it carries no value to join, so skip it
    next unless defined $val;

    # The input is sorted, so a change of key means the previous group is
    # complete. Test with defined(), not truthiness, so a key or value of
    # "0" is handled like any other.
    if ( !defined $key_a or $key ne $key_a ) {
        db_store($key_a, @vals_a) if defined $key_a;
        $key_a  = $key;
        @vals_a = ();
    }
    push @vals_a, $val;
}

# flush the final group
db_store($key_a, @vals_a) if defined $key_a;

close $fh_a;

# closing a writer flushes pending updates, so a failure here means data loss
$db->close or die "close error (db): ", $db->error;
Now the fun part. That would be stage_b.pl (note: redirect its output to a file, e.g. perl stage_b.pl > output.txt). This too is fast and completes in less than a minute on my laptop.
use strict;
use warnings;
use KyotoCabinet;

# stage_b.pl: join input_b.txt (sorted "KEY VALUE" lines) against the
# database input_a.kct. For every key present in both, print one
# "KEY VALUE_A VALUE_B" line to STDOUT for each pairing of a value from
# the database record with a value from input_b.txt.

# construct the database object
my $db = KyotoCabinet::DB->new();

# open the database read-only
if ( !$db->open('input_a.kct#pccap=256m', $db->OREADER) ) {
    die "open error (db): ", $db->error;
}

# open input_file b
open my $fh_b, '<', 'input_b.txt' or die "open error (txt): $!";

# Print the cross product of the database values and the given b-values
# for one key. Prints nothing when the key has no database record.
sub output_key {
    my ($key, @vals_b) = @_;

    my $vals_a = $db->get($key);
    return unless defined $vals_a;    # no record for this key: nothing to join

    foreach my $val_a ( split(' ', $vals_a) ) {
        foreach my $val_b (@vals_b) {
            print "$key $val_a $val_b\n";
        }
    }
    return;
}

my $key_b;     # key of the group being accumulated; undef until the first line
my @val_b;     # values collected so far for $key_b

while ( my $line = <$fh_b> ) {
    chomp $line;
    my ($key, $val) = split(' ', $line, 2);

    # blank or key-only line: it carries no value to join, so skip it
    next unless defined $val;

    # The input is sorted, so a change of key means the previous group is
    # complete. Test with defined(), not truthiness, so a key of "0" is
    # handled like any other.
    if ( !defined $key_b or $key ne $key_b ) {
        output_key($key_b, @val_b) if defined $key_b;
        $key_b = $key;
        @val_b = ();
    }
    push @val_b, $val;
}

# emit the final group
output_key($key_b, @val_b) if defined $key_b;

close $fh_b;
$db->close or warn "close error (db): ", $db->error, "\n";

# the output is large and usually redirected to a file; buffered write
# errors (e.g. disk full) may only surface when STDOUT is closed
close STDOUT or die "close error (stdout): $!";
Files and Output
Here is the ls -lh and wc -l output. There are some big files.
$ ls -lh
total 1705800
-rw-r--r-- 1 mario staff 96M Dec 1 01:17 input_a.kct
-rw-r--r-- 1 mario staff 281B Dec 1 01:15 input_a.pl
-rw-r--r-- 1 mario staff 124M Dec 1 01:15 input_a.txt
-rw-r--r-- 1 mario staff 349B Dec 1 01:15 input_b.pl
-rw-r--r-- 1 mario staff 207M Dec 1 01:16 input_b.txt
-rw-r--r-- 1 mario staff 381M Dec 1 01:18 output.txt
-rw-r--r-- 1 mario staff 797B Dec 1 01:12 stage_a.pl
-rw-r--r-- 1 mario staff 824B Dec 1 01:10 stage_b.pl
$ wc -l *.txt
9999990 input_a.txt
16666650 input_b.txt
26666640 output.txt
53333280 total
Finally, the head of output.txt.
000000001D x m
000000001D x n
000000001D x o
000000001D y m
000000001D y n
000000001D y o
000000001E t m
000000001E t s
000000002D x m
000000002D x n
000000002D x o
000000002D y m
000000002D y n
000000002D y o
000000002E t m
000000002E t s
000000003D x m
000000003D x n
000000003D x o
000000003D y m
000000003D y n
000000003D y o
000000003E t m
000000003E t s
...
One may choose another key-value database for stage_a. I do not understand how 1.7 trillion records is computed for your case. Well, I gave this a try and had to look up Kyoto Cabinet. I'm on the Mac and applied the three patches.
Regards, Mario