#!/usr/bin/perl

use strict;
use warnings;

use Sort::Naturally qw(nsort);
use MCE;

# This program reads an abstract sentence file and produces
# output with the following format ...

if ($#ARGV != 1) {
    print "usage: $0 <inputfile> <outputfile>\n";
    exit 1;
}

my $inputfile1 = $ARGV[0];
my $outputfile = $ARGV[1];

unless (-e $inputfile1) {
    die "Can't open $inputfile1: No such file or directory";
}

# Gather routine for the manager process.

my %hashunique;

sub gather {
    my ($hashref) = @_;
    for my $k1 (keys %{$hashref}) {
        for my $k2 (keys %{$hashref->{$k1}}) {
            $hashunique{$k1}{$k2} = undef;
        }
    }
}

# The user function for MCE workers. Workers open a file handle to
# a scalar ref due to using the MCE option use_slurpio => 1.

sub user_func {
    my ($mce, $slurp_ref, $chunk_id) = @_;
    my %localunique;

    open RF, '<', $slurp_ref or die "Can't open scalar ref for chunk $chunk_id: $!";

    # A shared hash is not necessary. The gist of it all is batching
    # to a local hash. Otherwise, a shared hash inside a loop involves
    # high IPC overhead.

    local $/ = '';   # blank line, paragraph break

    # Loop in the event the worker receives 2 or more records.
    while (<RF>) {
        my @lines = split /\n/, $_;
        # my ($indexofdashinarray) = grep { $lines[$_] =~ /\-\-/ } 0..$#lines;

        # Start at 1: the first line of each record is skipped.
        for my $i (1..$#lines) {
            next if $lines[$i] eq '--';
            while ($lines[$i] =~ m/(?:\b)D\*(.*?)\*(.*?)\*D(?:\b)/g) {
                $localunique{"D$1"}{$2} = undef;
            }
        }
    }

    close RF;

    # Call gather once, outside the loop.
    MCE->gather(\%localunique);
}

# This uses the core MCE API. Workers read the input file directly and
# sequentially, one worker at a time.

my $mce = MCE->new(
    max_workers => 4,
    input_data  => $inputfile1,
    chunk_size  => 1 * 1024 * 1024,   # 1 MiB
    RS          => '',                # important: blank line, paragraph break
    gather      => \&gather,
    user_func   => \&user_func,
    use_slurpio => 1
);

$mce->run();

# Results.

open WF, ">", $outputfile or die "Can't open $outputfile: $!";

foreach my $k (nsort keys %hashunique) {
    $hashunique{$k} = join '|', sort(keys %{$hashunique{$k}});
    print WF "$k=>$hashunique{$k}\n";
}

close WF;
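
# ---------------------------------------------------------------------------
# A minimal sketch (not part of the program above) of what the extraction
# regex captures; the sample line and the token values in it are hypothetical.
#
#   my $line = 'foo D*12*alpha*D bar D*12*beta*D';
#   while ($line =~ m/(?:\b)D\*(.*?)\*(.*?)\*D(?:\b)/g) {
#       print "key=D$1 value=$2\n";   # prints D12/alpha, then D12/beta
#   }
#
# Each unique value is collected under its "D<id>" key, so the output file
# would contain a line such as "D12=>alpha|beta" for the sample above.
#
# Example invocation (file names are placeholders):
#   perl this_script.pl abstracts.txt unique_terms.txt
# ---------------------------------------------------------------------------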