Make the "variants" an array and push items onto it. Then sift out unique values once all the data has been read.
use strict;
use warnings;
use Data::Dumper;
# Open an in-memory filehandle over the sample corpus. Each record is
# whitespace-separated: <surface form> <part-of-speech tag> <lemma>.
open my $inFH, q{<}, \ <<EOD or die $!;
The DT the
International NN International
for IN for
well NN well
preparation NN preparation
preparation NN preparation
in IN in
conference NN conference
conference NN conference
conferences NN conference
good VVG good
EOD

# Throw away the first line of the input before any processing.
# NOTE(review): in this sample the first line is an ordinary (non-NN) record
# rather than a header, so the result is unaffected -- confirm that real
# input starts with a line that is safe to drop.
do { my $discard = <$inFH> };

# %hash maps lemma => { frequency => N, variants => [ surface forms ] }.
my %hash;
while ( my $line = <$inFH> )
{
    # Bare-string q{ } gives awk-style splitting: leading whitespace is
    # ignored and runs of whitespace count as one separator.
    my @tags = split q{ }, $line;

    # Blank or truncated lines have no tag and/or lemma; skip them rather
    # than comparing undef (a warning) or filing a count under an empty key.
    next unless @tags >= 3;

    # Only nouns are of interest.
    next unless $tags[ 1 ] eq q{NN};

    $hash{ $tags[ 2 ] }->{ frequency } ++;
    push @{ $hash{ $tags[ 2 ] }->{ variants } }, $tags[ 0 ];
}
close $inFH or die $!;

# Second pass: reduce each lemma's variants to unique values, keeping the
# order in which each form was first seen.
for my $lemma ( keys %hash )
{
    my %seen;
    my @unique = grep { not $seen{ $_ } ++ } @{ $hash{ $lemma }->{ variants } };
    $hash{ $lemma }->{ variants } = \@unique;
}

# Dump() is the public entry point; it uses the XS implementation when it
# is available and falls back to pure Perl otherwise. Hash key order in
# the printed structure is not guaranteed.
print Data::Dumper->Dump( [ \ %hash ], [ qw{ *hash } ] );
The output.
%hash = (
'preparation' => {
'frequency' => 2,
'variants' => [
'preparation'
]
},
'conference' => {
'frequency' => 3,
'variants' => [
'conference',
'conferences'
]
},
'well' => {
'frequency' => 1,
'variants' => [
'well'
]
},
'International' => {
'variants' => [
'International'
],
'frequency' => 1
}
);
I hope this is helpful.
Update: Perhaps simpler would be to keep a per-lemma ->{ seen } sub-hash, keyed on $tags[ 0 ], to filter out duplicates while reading, and then delete it once all the data has been read.
...
# Read loop that de-duplicates variants as it goes: each lemma carries a
# temporary { seen } sub-hash recording the surface forms already pushed.
while ( my $line = <$inFH> )
{
    # Awk-style split on whitespace: <surface form> <tag> <lemma>.
    my @tags = split q{ }, $line;

    # Blank or truncated lines have no tag and/or lemma; skip them rather
    # than comparing undef (a warning) or filing a count under an empty key.
    next unless @tags >= 3;

    # Only nouns are of interest.
    next unless $tags[ 1 ] eq q{NN};

    my $entry = $hash{ $tags[ 2 ] } ||= {};
    $entry->{ frequency } ++;

    # Post-increment yields the old count, so the push happens only the
    # first time a given surface form is met for this lemma.
    push @{ $entry->{ variants } }, $tags[ 0 ]
        unless $entry->{ seen }->{ $tags[ 0 ] } ++;
}

# The { seen } index was only scaffolding for the loop above; remove it so
# each entry is left with just frequency and variants.
delete $hash{ $_ }->{ seen } for keys %hash;
...
|