use strict; use warnings; use Bit::Vector::Overload; use List::Util 'shuffle'; my $MIN = 3; ## Some keywords my @keywords = qw[ zero one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen ]; ## Generate some test data my %items = map{ $_ => [ @keywords[ ( shuffle( 0 .. $#keywords ) )[ 0 .. rand @keywords ] ] ] } 'a' .. 'z'; print "The item list:\n"; print "$_ => @{ $items{ $_ } }\n" for sort keys %items; print '=' x 30, "\n"; # First, build an index of the distinct values by building a hash # (anonymous) and taking the keys my @val_index = keys %{{map {($_ => undef)} map {@$_} values %items}}; # and a reverse lookup my %rev_val_index = map {($val_index[$_] => $_)} 0..$#val_index; # Now represent each entry as a bit vector my %vectors; while (my ($k, $v) = each %items) { $vectors{$k} = new Bit::Vector(scalar(@val_index)); $vectors{$k}->Bit_On($_) for @rev_val_index{@$v}; } # Compare elements pairwise and add each element # to an AoHoH indexed by size of tuple and tuple member list # if they have elements in common my @intersections; my @item_keys = keys %items; for my $i (0..$#item_keys-1) { for my $j ($i+1..$#item_keys) { my $intersect = $vectors{$item_keys[$i]} & $vectors{$item_keys[$j]}; my @common_elements = @val_index[$intersect->Index_List_Read]; next if @common_elements < $MIN; my $name_list = join ' ', @common_elements; @{$intersections[scalar @common_elements]{$name_list}}{@item_keys[$i,$j]} = (); # Include any higher-order matches in lower-order matches for my $k (0..$#item_keys) { next if $k == $i or $k == $j; my $new_isect = $intersect & $vectors{$item_keys[$k]}; if ($new_isect eq $intersect) { $intersections[scalar @common_elements]{$name_list}{$item_keys[$k]} = (); } } } } for ($MIN..$#intersections) { next unless keys %{$intersections[$_]}; print "=== $_-tuples of common elements:\n"; while (my ($k, $v) = each %{$intersections[$_]}) { print "$k: ", join(', ', sort keys %$v), "\n"; } }