# Node id ('iaa' .. 'izz') => array ref of that node's unique keywords,
# sorted asciibetically.  Filled in by build_test_data().
my %Items;

# Populate %Items with pseudo-random test data.
#
# Called with no arguments it only fills %Items.  Called with any argument
# it additionally prints a Data::Dumper dump of the whole structure (lots of
# raw data!).
#
# Each node's keyword set is collected in a scratch hash and only the
# finished list is stored in %Items, so calling the sub again simply
# rebuilds every entry instead of tripping over the array refs left behind
# by the previous call.
sub build_test_data {
    # Fixed seed: reproducible case.
    srand(12345);

    # Sorted by prevalence.  Keyword 'kaa' is way more common than 'kzz'.
    my @keywords = 'kaa' .. 'kzz';

    # Each node is associated with an asciibetical list of unique keywords.
    for my $node ('iaa' .. 'izz') {
        # Between 4 and 11 draws per node; duplicates collapse in the hash.
        my $draws = int(rand(8)) + 4;

        # Multiplying two rand() values skews the index toward the front
        # of @keywords, i.e. toward the more prevalent keywords.
        my %seen;
        $seen{ $keywords[ int(rand() * rand() * @keywords) ] }++
            while $draws--;

        # We groom out the top keywords, which are basically noise.
        delete @seen{ 'kaa' .. 'kab' };

        $Items{$node} = [ sort keys %seen ];
    }

    return unless @_;

    # Debug output only.  Load the module here and call it fully qualified
    # (with parens) so this can never be parsed as a print to a filehandle
    # named "Dumper" when the import is missing.
    require Data::Dumper;
    print Data::Dumper::Dumper(\%Items);
}

build_test_data();