This is a continuation of Re: Problem in RAM usage while threading the program.
Update: Added Inline::CPP code.
Part Seven Inline::C and Inline::CPP demonstrations.
There was one more thing to try: Inline::C / Inline::CPP on Linux, of all things, to see what improvement that would bring.
Inline::C
# https://fallabs.com/kyotocabinet/api/
use Inline C => config => inc => '-I/usr/local/include';
use Inline C => config => libs => '-L/usr/local/lib -lkyotocabinet';
use Inline C => <<'EOC';
#include <string.h>
#include <kclangc.h>
// Database handle shared by open_db(), close_db() and search_db().
KCDB *db;
// Cursor on db; created in open_db(), walked by search_db() and
// released in close_db().
KCCUR *cur;
// Open the database as a reader (no locking) and create the cursor
// used for scanning.
//
// file: database path, handed unchanged to kcdbopen(); the scripts in
//       this post pass 'db.kch#msiz=128m'.
// Returns 1 on success, 0 when kcdbopen() reports failure (for example
// the file cannot be opened, or the handle is already open).
int open_db(char* file) {
    // Reuse an existing handle (left by a failed open, or closed but not
    // freed) rather than allocating a new one on every call and leaking
    // the old one.
    if (!db) db = kcdbnew();
    if (!kcdbopen(db, file, KCOREADER | KCONOLOCK)) return 0;
    cur = kcdbcursor(db);
    return 1;
}
// close the database
int close_db() {
if (cur) kccurdel(cur);
if (!kcdbclose(db)) return 0;
return 1;
}
// Collect the keys of all records whose value contains substr.
//
// substr: NUL-terminated text searched for inside each record value.
// Returns a reference to a new Perl array holding the matching keys; the
// array is empty when nothing matches or when no cursor exists.
SV * search_db(char* substr) {
    AV *ret = newAV();
    char *kbuf;
    const char *cvbuf;
    size_t ksiz, vsiz;
    // No cursor means open_db() was not called or did not succeed;
    // passing a NULL cursor on would crash the interpreter.
    if (cur) {
        kccurjump(cur);  // position the cursor on the first record
        // The final argument (1) steps the cursor after each read.
        while ((kbuf = kccurget(cur, &ksiz, &cvbuf, &vsiz, 1)) != NULL) {
            // strstr() treats the value as a C string, so the comparison
            // stops at the first NUL byte.
            if (strstr(cvbuf, substr) != NULL) {
                av_push(ret, newSVpvn(kbuf, ksiz));
            }
            // Only kbuf is freed; cvbuf is not released separately.
            kcfree(kbuf);
        }
    }
    return newRV_noinc((SV *) ret);
}
EOC
Inline::CPP
# https://fallabs.com/kyotocabinet/api/
use Inline CPP => config => inc => '-I/usr/local/include';
use Inline CPP => config => libs => '-L/usr/local/lib -lkyotocabinet';
use Inline CPP => <<'EOCPP';
#undef do_open
#undef do_close
#include <string.h>
#include <kcpolydb.h>
using namespace std;
using namespace kyotocabinet;
// Database object shared by open_db(), close_db() and search_db().
PolyDB db;
// Cursor on db; created in open_db(), walked by search_db() and
// deleted in close_db().
DB::Cursor *cur;
// Open the database with the OREADER | ONOLOCK mode flags and, on
// success, obtain the cursor that search_db() iterates with.
// Returns 1 when the database was opened, 0 otherwise.
int open_db(char* file) {
    const bool opened = db.open(file, PolyDB::OREADER | PolyDB::ONOLOCK);
    if (opened) {
        cur = db.cursor();
    }
    return opened ? 1 : 0;
}
// close the database
int close_db() {
if (cur) delete cur;
if (!db.close()) return 0;
return 1;
}
// Collect the keys of all records whose value contains substr.
//
// substr: NUL-terminated text searched for inside each record value.
// Returns a reference to a new Perl array holding the matching keys; the
// array is empty when nothing matches or when no cursor exists.
SV * search_db(char* substr) {
    AV *ret = newAV();
    // No cursor means open_db() was not called or did not succeed;
    // dereferencing the null pointer would crash the interpreter.
    if (cur == NULL) {
        return newRV_noinc((SV *) ret);
    }
    string ckey, cvalue;
    cur->jump();  // position the cursor on the first record
    // The final argument (true) steps the cursor after each read.
    while (cur->get(&ckey, &cvalue, true)) {
        // strstr() treats the value as a C string, so the comparison
        // stops at the first NUL byte.
        if (strstr(cvalue.c_str(), substr) != NULL) {
            av_push(ret, newSVpvn(ckey.data(), ckey.size()));
        }
    }
    return newRV_noinc((SV *) ret);
}
EOCPP
Serial
# https://www.perlmonks.org/?node_id=11110379
# usage: perl search_db_inline_c.pl > Outfile.txt
#
# Serial search: for each peptide listed in peptides.txt, print the
# peptide, a tab, and the comma-separated keys returned by search_db().
# Peptides without a match print nothing.
use strict;
use warnings;
# insert the Inline::C or Inline::CPP code here
open_db('db.kch#msiz=128m') or die "db.kch: open error\n";
open my $fh, '<', 'peptides.txt' or die "open error: $!\n";
while ( my $pep = <$fh> ) {
    chomp $pep;
    # An empty pattern is a substring of every value, so a blank line
    # would otherwise list every record in the database.
    next unless length $pep;
    my $ids = search_db($pep);
    print "$pep\t", join(',', @$ids), "\n" if @$ids;
}
close $fh;
# close_db() returns false on failure; report it instead of ignoring it.
close_db() or warn "db.kch: close error\n";
Parallel
# https://www.perlmonks.org/?node_id=11110379
# usage: perl search_db_inline_c_mce.pl > Outfile.txt
#
# Parallel search: MCE workers each take one line of peptides.txt at a
# time, look it up with search_db(), and print matches in input order.
use strict;
use warnings;
# insert the Inline::C or Inline::CPP code here
use MCE;
my $mce = MCE->new(
    max_workers => MCE::Util::get_ncpu(),
    chunk_size  => 1,    # one input line per chunk, delivered in $_
    init_relay  => 1,    # enables MCE::relay for ordered output
    # Each worker opens the database for itself when it starts ...
    user_begin => sub {
        open_db('db.kch#msiz=128m') or die "db.kch: open error\n";
    },
    # ... and closes it again before it exits.
    user_end => sub {
        close_db();
    },
    user_func => sub {
        my $pep = $_; chomp $pep;
        # An empty pattern is a substring of every value, so a blank line
        # would otherwise list every record. Skip the search but still
        # fall through to MCE::relay below: the relay has to be called
        # for every chunk or the ordered hand-off stalls.
        my $ids = length $pep ? search_db($pep) : [];
        # output serially, one worker at a time
        MCE::relay {
            print "$pep\t", join(',', @$ids), "\n" if @$ids;
        };
    }
);
$mce->process('peptides.txt');
$mce->shutdown;
Outfile.txt
GAAGGACTGGGACCA >NR_000001,>NR_006611
AGGCTGCGGCAGGAC >NR_062102
GTGAGCCGGGCAGAG >NR_089584
AGGGGGGGTTGCTGA >NR_036454,>NR_068535,>NR_097889
CTGACATGCGGCGCA >NR_087289
GTGCATCGATGGCCG >NR_005535
GGGGTCAAGCGAACC >NR_076289,>NR_087856
TGAGACGGCGAACCT >NR_064242
AGCGACAAAGGAAAC >NR_045865
AGGTGCAACCATGGA >NR_046602,>NR_056869
GAGTAAACCGCGCGA >NR_093455
AACGACTGAACAGCG >NR_070693
ACGCGTAATTCGATA >NR_080086
GATGAGCGGAGCACT >NR_070118
CGTAGCGAAACCGAG >NR_092384,>NR_098291
GGGGGGGAGGTCCGA >NR_021671,>NR_036907,>NR_080961
AGGGAGGGGGGTTGT >NR_026207
ATGGGGCAGACGCGA >NR_072314
Benchmark - 8-core VM, CentOS 7.7
62.883 seconds - op's code
33.594 seconds - search_db.pl
6.367 seconds - search_db_mce.pl
4.991 seconds - search_db_chunk.pl
8.093 seconds - search_db_inline_c.pl
1.530 seconds - search_db_inline_c_mce.pl
Benchmark - 8-core VM, Xubuntu 18.04.3
36.684 seconds - search_db.pl
7.403 seconds - search_db_mce.pl
5.986 seconds - search_db_chunk.pl
11.009 seconds - search_db_inline_c.pl
2.188 seconds - search_db_inline_c_mce.pl
Benchmark - 8-core, macOS Mojave 10.14.6
28.857 seconds - search_db.pl
6.130 seconds - search_db_mce.pl
5.219 seconds - search_db_chunk.pl
27.322 seconds - search_db_inline_c.pl
5.808 seconds - search_db_inline_c_mce.pl
This completes the exercise. It boggles my mind how much the results differ between CentOS and Xubuntu. Likewise, Inline::C runs much faster on Linux than on macOS. Testing was done with Perl 5.30.1. The virtualization is handled by VMware Fusion.
Regards, Mario
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (keep them under 70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.