For any comparative linguists, here is what it looks like in Python (it even works):
import re
import sys
from collections import defaultdict
wordsInOrder = []
for line in sys.stdin:
wordsInOrder.extend( re.findall(r'\w+', line.lower()) )
single = defaultdict(int)
double = defaultdict(int)
triple = defaultdict(int)
for i, word in enumerate(wordsInOrder):
single[word] += 1
try:
next_word = wordsInOrder[i+1]
double[word + ' ' + next_word] += 1
next_next_word = wordsInOrder[i+2]
triple[word + ' ' + next_word + ' ' + next_next_word] += 1
except:
pass
def sort_by_frequency(d):
return sorted(d.iterkeys(), cmp = lambda x,y: cmp(d[y], d[x]))
for singlet in sort_by_frequency(single):
print singlet
for doublet in sort_by_frequency(double):
if not doublet.startswith(singlet + ' '):
continue
print "\t", doublet
for triplet in sort_by_frequency(triple):
if not triplet.startswith(doublet + ' '):
continue
print "\t\t", triplet
I needed
amusement ;-)