I'm boostering for KinoSearch because I think it's undervalued or perhaps just not known well enough.
use warnings;
use strict;
use KinoSearch::Plan::Schema;
use Time::HiRes qw( tv_interval gettimeofday );
# Build (on first run) and then interactively query a KinoSearch index of
# whitespace-separated "term number" records.
#
# First run:  pass the data file as the first argument; every line holding
#             at least two fields is added to the index under ./ks-index.
# Later runs: the existing ./ks-index directory is reused as-is and no data
#             file is needed.
# Queries are read from STDIN, one per line, until end of input.

# Two fields per document: "term" uses a default StringType, "number" uses a
# StringType constructed with indexed => 0.
my $schema      = KinoSearch::Plan::Schema->new;
my $string_type = KinoSearch::Plan::StringType->new;
my $store_only  = KinoSearch::Plan::StringType->new( indexed => 0 );
$schema->spec_field( name => "term",   type => $string_type );
$schema->spec_field( name => "number", type => $store_only );

my $index_dir = "./ks-index";

# Only index when the index directory does not exist yet.
unless ( -d $index_dir ) {
    # Check for the argument before creating the directory, so a missing
    # argument does not leave an empty index directory behind.
    my $data_file = shift || die "Give me a data file!\n";
    mkdir $index_dir or die $!;

    my $indexer = KinoSearch::Index::Indexer->new(
        index  => $index_dir,
        schema => $schema,
        # NOTE(review): the directory was created by mkdir just above, so
        # this test evaluates false here -- confirm that is the intent.
        create => ! -d $index_dir,
        # Truncate each run or duplicate content.
        truncate => 1,
    );

    open my $data, "<", $data_file
        or die "Couldn't open $data_file to read: $!";
    while ( my $line = <$data> ) {
        # split ' ' ignores leading whitespace and drops the trailing
        # newline, so an indented line does not produce an empty term.
        my ( $term, $number ) = split ' ', $line;

        # Skip blank or short lines instead of indexing undefined values.
        next unless defined $term && defined $number;

        $indexer->add_doc({
            term   => $term,
            number => $number,
        });
    }
    close $data or warn "Couldn't close $data_file: $!";

    $indexer->commit;
}

print "I'm going to search as long as you give me input...\n";

my $searcher = KinoSearch::Search::IndexSearcher->new( index => $index_dir );

while ( my $query = <STDIN> ) {
    chomp $query;

    # Time the search together with fetching/printing the first five hits.
    my $t0   = [gettimeofday];
    my $hits = $searcher->hits(
        query      => $query,
        offset     => 0,
        num_wanted => 5,
    );

    while ( my $hit = $hits->next ) {
        printf "%20s --> %d\n", $hit->{term}, $hit->{number};
    }

    printf qq{Found %d matches looking for "%s"\n}, $hits->total_hits, $query;
    printf "Search took %.3f seconds\n", tv_interval( $t0, [gettimeofday] );
}

exit 0;
Using this dataset (one million records)-
perl -Minteger -le 'printf"text%d\t%d\n",rand(100), rand(100) for 1 .. 1_000_000'
That code gives these results (on a fairly modest *nix box)-
I'm going to search as long as you give me input...
text13
text13 --> 31
text13 --> 22
text13 --> 69
text13 --> 81
text13 --> 96
Found 10044 matches looking for "text13"
Search took 0.002 seconds
text99
text99 --> 66
text99 --> 76
text99 --> 11
text99 --> 59
text99 --> 26
Found 9964 matches looking for "text99"
Search took 0.002 seconds
text100
Found 0 matches looking for "text100"
Search took 0.000 seconds
It scales extremely well. It may not look like an obvious or conceptual match for your problem space, but it has the goods and might be exactly what you need.