I'm surprised that essentially no one has commented on this mammoth of a post in almost a month. I'll give it a shot.
I decided to try to "precalculate enough information to allow each question to be solveable in polynomial time." My approach is heavily influenced by yours, particularly the part about "not all words need be considered" (e.g., "screwdriver" contains all of the letters "disc" does).
Since you already used a language other than Perl, I decided to use C++. At the end of this post, I have included two programs, one for precomputation and one for queries. The first takes in a dictionary file and produces a "digest" file -- a list of all useful singles, doubles, triples, and quadruples (at 18_341 lines and 394K, it's smaller than the dictionary). The other one answers questions based on it, assuming the digest file is "digest.txt". As with your Java code, please excuse idiosyncrasies in my C++ code; over time, I've developed a mostly consistent, but perhaps a tad strange, coding style. (In particular, I can get rid of the pointers in this code, but I just think it's easier to read this way.)
Of obvious interest is how fast my code runs. Here's the summary of running the precomputation code on my computer, which will hopefully also help explain the approach: (Incidentally, I believe it takes between a gig and two of RAM, slightly more than your requirements -- I could knock that down if I wanted, but I didn't feel the need.)
% time ./challenge < words.txt > digest.txt
Reading in single words.
61406 single words.
Finding all useful single words.
3477 useful single words.
Listing all pairs of words.
6043026 total pairs.
Finding all useful pairs.
14574 useful pairs.
Listing all triples of words.
50644650 total triples.
Finding all useful triples.
289 useful triples.
Finding a single universal quadruple.
Universal quadruple found.
952.27u 3.88s 15:57.57 99.8%
I also tried running all 67_108_863 = 2**26 - 1 different combinations through my query answerer. It took a total of 42 minutes, for an average rate of ~25,000 queries per second.
Overall summary is that coding took about two hours from scratch, precomputation took 16 minutes, and query answering is definitely fast enough. Enjoy!
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
using namespace std;
// Combination consists of chosen words and present letters bitfield
struct combination {
unsigned short a, b, c, d;
unsigned long present;
combination* next; // Next non-useless combination (see find_usele
+ss)
};
bool operator < ( const combination& u, const combination& v ) {
return u.present > v.present;
}
const short NONE = -1; // I'm aware of the signed -> unsigned conversi
+on
const unsigned long ALL = (1 << 26) - 1;
int optimize( int );
void find_useless( vector<combination>&, vector<combination>& );
int main( ) {
int i, j;
vector<string> words;
vector<combination> one, onep, two, twop, three, threep, four;
// Read input, compute "present" bitfields
cerr << "Reading in single words." << endl;
string word;
for( i = 0; cin >> word; i++ ) {
combination c = {i, NONE, NONE, NONE, 0, NULL};
for( j = 0; j < word.length(); j++ ) {
if( word[j] >= 'A' && word[j] <= 'Z' )
word[j] += 'a' - 'A';
if( word[j] < 'a' || word[j] > 'z' )
continue;
c.present |= 1 << (word[j] - 'a');
}
words.push_back( word );
one .push_back( c );
}
cerr << one.size() << " single words." << endl;
// Find useless words
cerr << "Finding all useful single words." << endl;
find_useless( one, onep );
cerr << onep.size() << " useful single words." << endl;
one = vector<combination>();
for( i = 0; i < onep.size(); i++ )
cout << words[onep[i].a] << "\n";
// Compute all doubles now
cerr << "Listing all pairs of words.\n";
for( i = 0; i < onep.size(); i++ ) {
for( j = i+1; j < onep.size(); j++ ) {
combination c = {onep[i].a, onep[j].a, NONE, NONE, onep[i].pre
+sent | onep[j].present, NULL};
two.push_back( c );
}
}
cerr << two.size() << " total pairs." << endl;
// Find useless two-word combinations
cerr << "Finding all useful pairs." << endl;
find_useless( two, twop );
cerr << twop.size() << " useful pairs." << endl;
two = vector<combination>();
for( i = 0; i < twop.size(); i++ )
cout << words[twop[i].a] << " " << words[twop[i].b] << "\n";
// Compute all triples now
cerr << "Listing all triples of words." << endl;
for( i = 0; i < onep.size(); i++ ) {
for( j = 0; j < twop.size(); j++ ) {
if( onep[i].a == twop[j].a || onep[i].a == twop[j].b )
continue;
combination c = {onep[i].a, twop[j].a, twop[j].b, NONE, onep[i
+].present | twop[j].present, NULL};
three.push_back( c );
}
}
cerr << three.size() << " total triples." << endl;
// Find useless three-word combinations
cerr << "Finding all useful triples.\n";
find_useless( three, threep );
cerr << threep.size() << " useful triples.\n";
three = vector<combination>();
for( i = 0; i < threep.size(); i++ )
cout << words[threep[i].a] << " " << words[threep[i].b] << " " <<
+words[threep[i].c] << "\n";
// Find four-letter combination spanning everything
cerr << "Finding a single universal quadruple.\n";
for( i = 0; i < twop.size(); i++ ) {
for( j = 0; j < twop.size(); j++ ) {
if( (twop[i].present | twop[j].present) == ALL )
break;
}
if( j < twop.size() )
break;
}
if( i < twop.size() ) {
cout << words[twop[i].a] << " " << words[twop[i].b] << " "
<< words[twop[j].a] << " " << words[twop[j].b] << "\n";
cerr << "Universal quadruple found." << endl;
} else {
cerr << "Universal quadruple not found." << endl;
}
return 0;
}
void find_useless( vector<combination>& in, vector<combination>& out )
+ {
int i;
// Sort (decreasing) by present
sort( in.begin(), in.end() );
// String together nexts
for( i = 0; i+1 < in.size(); i++ )
in[i].next = &in[i+1];
// Find useless words
combination *c, *d;
for( c = &in[0]; c->next != NULL; c = c->next ) {
for( d = c; d->next != NULL; ) {
if( (d->next->present & ~c->present) == 0 ) {
d->next = d->next->next;
} else {
d = d->next;
}
}
if( c->next == NULL )
break;
}
// Output
for( c = &in[0]; c != NULL; c = c->next )
out.push_back( *c );
}
#include <string>
#include <vector>
#include <fstream>
#include <iostream>
using namespace std;
struct datum {
string combination;
unsigned long present;
};
unsigned long present( string s ) {
int j;
unsigned long present = 0;
for( j = 0; j < s.length(); j++ ) {
if( s[j] >= 'A' && s[j] <= 'Z' )
s[j] += 'a' - 'A';
if( s[j] < 'a' || s[j] > 'z' )
continue;
present |= 1 << (s[j] - 'a');
}
return present;
}
int main( ) {
int i;
ifstream in( "digest.txt" );
vector<datum> data;
// Read combinations from digest
string line;
for( i = 0; getline( in, line ); i++ ) {
datum d = { line, present( line ) };
data.push_back( d );
}
// Answer queries
while( getline( cin, line ) ) {
unsigned long query = present( line );
for( i = 0; i < data.size(); i++ ) {
if( (query & ~data[i].present) == 0 )
break;
}
cout << data[i].combination << endl;
}
return 0;
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.