#!/usr/bin/env perl
# Count how often each non-stop word occurs in a text file and write the
# totals, sorted by word, to a tab-separated output file.
#
# NOTE: the shebang must be on a line of its own; anything that follows it on
# the same line is treated as part of the comment and never executed.
use strict;
use warnings;
use autodie;

use Lingua::StopWords 'getStopWords';
use Text::CSV;

my ($lang, $encoding) = qw{en UTF-8};

# Per-language pattern that extracts one word from a whitespace-delimited
# token.  The capture starts at the first word boundary, may contain
# alphanumerics and apostrophes, and must end in an alphanumeric, so
# surrounding punctuation and a trailing apostrophe are dropped.
my %word_re_for = (
    en => qr{^.*?\b([\p{Alnum}']*[\p{Alnum}]+).*$},
);

my ($in_file, $out_file) = qw{test_input.txt test_output.csv};

# Hash ref whose keys are the (lower-case) words to ignore.
my $is_stop = _mod_stops(getStopWords($lang, $encoding));

# Maps each lower-cased, non-stop word to its number of occurrences.
my %count_for;

# Pass 1: read the input and tally the words.
{
    # The lookup is loop-invariant, so do it once.
    my $word_re = $word_re_for{$lang};

    open my $in_fh, '<:encoding(UTF-8)', $in_file;

    while (my $line = <$in_fh>) {
        TOKEN:
        for my $token (split ' ', $line) {
            next TOKEN unless $token =~ $word_re;
            my $word = lc $1;

            next TOKEN if $is_stop->{$word};
            ++$count_for{$word};
        }
    }

    close $in_fh;
}

# Pass 2: write one "word<TAB>count" row per word, in string-sorted order.
{
    open my $out_fh, '>:encoding(UTF-8)', $out_file;
    my $csv = Text::CSV->new({sep_char => "\t", binary => 1});

    for my $word (sort keys %count_for) {
        # autodie does not cover print, so check the row write ourselves.
        $csv->say($out_fh, [$word, $count_for{$word}])
            or die "Cannot write to '$out_file': " . $csv->error_diag;
    }

    # Buffered write errors only surface at close; closing explicitly lets
    # autodie report them instead of losing them at scope exit.
    close $out_fh;
}

# _mod_stops($stops)
#
# Customise a stop-word table in place.
#
#   $stops - hash ref keyed by stop word.
#
# Adds a few extra words to the table (each with the value 1), removes the
# words that should be counted after all, and returns the same hash ref
# that was passed in.
sub _mod_stops {
    my ($stops) = @_;

    my @adds = qw{thou thee thy thine u ur};
    my @dels = qw{very same};

    $stops->{$_} = 1 for @adds;
    delete @{$stops}{@dels};

    return $stops;
}