use warnings;
use strict;
use KinoSearch::Plan::Schema;
use Time::HiRes qw( tv_interval gettimeofday );
# Describe the index layout: every document carries a "term" field and a
# "number" field, both declared as KinoSearch string types.
my $schema = KinoSearch::Plan::Schema->new;

# Field name => field type.  "number" is built with indexed => 0
# (presumably stored for display only, not searchable -- confirm against
# the KinoSearch::Plan::StringType documentation).
my %type_for = (
    term   => KinoSearch::Plan::StringType->new,
    number => KinoSearch::Plan::StringType->new( indexed => 0 ),
);

# Register the fields in a fixed order: "term" first, then "number".
for my $field_name (qw( term number )) {
    $schema->spec_field(
        name => $field_name,
        type => $type_for{$field_name},
    );
}
# Directory holding the KinoSearch index.  It is read again further down
# when the searcher is opened.
my $index_dir = "./ks-index";

# Build the index only on the first run, i.e. when the index directory does
# not exist yet.  The data file named on the command line is expected to
# hold one whitespace-separated "term number" pair per line.
unless ( -d $index_dir ) {
    my $data_file = shift || die "Give me a data file!\n";

    # Open the input *before* creating the index directory.  Otherwise a
    # mistyped file name would leave an empty directory behind, and every
    # later run would skip indexing because the directory already exists.
    open my $data, "<", $data_file
        or die "Couldn't open $data_file to read: $!";

    mkdir $index_dir or die $!;

    my $indexer = KinoSearch::Index::Indexer->new(
        index  => $index_dir,
        schema => $schema,

        # The directory was created just above, so there is nothing left
        # for the indexer to create; pass an explicit false value instead
        # of re-testing -d, which could only ever be true at this point.
        create => 0,

        # Truncate each run or duplicate content.
        truncate => 1,
    );

    LINE:
    while ( my $line = <$data> ) {

        # split ' ' drops leading whitespace and the trailing newline, so
        # an indented line does not produce an empty first field.
        my ( $term, $number ) = split ' ', $line;

        # Skip blank lines and lines that lack the second column rather
        # than indexing undefined values.
        next LINE unless defined $number;

        $indexer->add_doc( {
            term   => $term,
            number => $number,
        } );
    }

    close $data
        or die "Couldn't close $data_file: $!";

    $indexer->commit;
}
# Interactive search loop: read one query per line from STDIN, print up to
# five hits for it, then report the total hit count and the elapsed time.
# The loop ends when STDIN reaches end-of-file.
print "I'm going to search as long as you give me input...\n";
my $searcher = KinoSearch::Search::IndexSearcher->new( index => $index_dir );

while ( my $query = <STDIN> ) {
    chomp $query;

    # Start the clock before the search; the interval reported below
    # therefore covers both the search call and the printing of the hits.
    my $t0 = [gettimeofday];

    my $hits = $searcher->hits(
        query      => $query,
        offset     => 0,
        num_wanted => 5,
    );

    while ( my $hit = $hits->next ) {
        printf "%20s --> %d\n", $hit->{term}, $hit->{number};
    }

    # The query is kept in a named lexical rather than $_ so that nothing
    # called in between can clobber it before it is echoed back here.
    printf qq{Found %d matches looking for "%s"\n}, $hits->total_hits,
        $query;
    printf "Search took %.3f seconds\n", tv_interval( $t0, [gettimeofday] );
}

exit 0;
Using this dataset (one million records):
perl -Minteger -le 'printf"text%d\t%d\n",rand(100), rand(100) for 1 .. 1_000_000'
It scales extremely well. It may not look like an obvious or conceptual match for your problem space, but it has the goods and might be exactly what you need.