#!/usr/bin/env perl

use strict;
use warnings;
use autodie;

use Lingua::StopWords 'getStopWords';
use Text::CSV;

# Language of the text being analysed, and the encoding requested for its
# stop-word list.
my $lang     = 'en';
my $encoding = 'UTF-8';

# Per-language pattern used to pull a word out of a whitespace-delimited
# token.  The single capture group is a run of alphanumerics and apostrophes
# that must end in an alphanumeric; whatever surrounds it in the token is
# matched but not captured.
my %word_re_for = (
    en => qr{^.*?\b([\p{Alnum}']*[\p{Alnum}]+).*$},
);

# Text file to read, and the file the tallies are written to.
my $in_file  = 'test_input.txt';
my $out_file = 'test_output.csv';

# Hash reference used as a stop-word lookup: the list supplied for $lang,
# after the local adjustments made by _mod_stops().
my $is_stop = _mod_stops(getStopWords($lang, $encoding));

# Occurrence count for each lower-cased, non-stop word.
my %count_for;

# Read $in_file line by line and tally every word that is not a stop word
# into %count_for.
{
    # Look the pattern up once, and fail fast when the language has no entry:
    # an undefined pattern would be used as an empty regex instead of
    # rejecting tokens, which would silently corrupt the counts.
    my $word_re = $word_re_for{$lang};
    die "No word pattern defined for language '$lang'"
        unless defined $word_re;

    open my $fh, '<:encoding(UTF-8)', $in_file;

    while (my $line = <$fh>) {
        # split ' ' breaks on runs of whitespace and drops leading whitespace.
        TOKEN: for my $token (split ' ', $line) {
            next TOKEN unless $token =~ $word_re;

            # $1 is the word captured by the pattern; counts are
            # case-insensitive because the word is folded to lower case.
            my $word = lc $1;
            next TOKEN if $is_stop->{$word};
            ++$count_for{$word};
        }
    }

    close $fh;
}

# Write the tallies to $out_file, one tab-separated "word, count" row per
# word, with the words in Perl's default string sort order.
{
    open my $fh, '>:encoding(UTF-8)', $out_file;

    # new() returns undef when it rejects its options; report the reason
    # instead of failing later with a method call on an undefined value.
    my $csv = Text::CSV->new({sep_char => "\t", binary => 1})
        or die 'Cannot create Text::CSV object: ' . Text::CSV->error_diag();

    for my $word (sort keys %count_for) {
        $csv->say($fh, [$word, $count_for{$word}])
            or die "Cannot write row for '$word' to $out_file: "
                . $csv->error_diag();
    }

    # Buffered write errors only surface when the handle is closed.  An
    # explicit close lets autodie turn such a failure into an exception,
    # which the implicit close at scope exit would silently discard.
    close $fh;
}

# Adjust a stop-word lookup in place and return it.
#
# Takes a hash reference keyed by word.  A fixed set of extra words is marked
# as stop words (value 1), two words are removed from the set so that they
# are no longer treated as stop words, and the same reference is returned.
sub _mod_stops {
    my ($stops) = @_;

    my @extra_stops = qw{thou thee thy thine u ur};
    my @not_stops   = qw{very same};

    # Hash-slice assignment: every extra word gets the value 1.
    @{$stops}{@extra_stops} = (1) x @extra_stops;

    delete $stops->{$_} for @not_stops;

    return $stops;
}