#!/usr/bin/perl

use strict;
use warnings;
use Compress::Zlib;
use Getopt::Long;
use Pod::Usage;

my $VERSION = 0.7;
my $dictfile = 'dict.gz';

#  Process command-line options

my $help = '';
my $man = '';
my $version = '';
my $token_debug = '';
my $glossary_output = '';
my $dictionary_output = '';

GetOptions( 'help|?' => \$help, 'version' => \$version, 'man' => \$man, 'token-debug' => \$token_debug, 'glossary' => \$glossary_output, 'dictionary' => \$dictionary_output );

print "This is version $VERSION of $0.\n" if $version;
exit(0) if ($version);
pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

my $file = shift;
my %dictionary = readdict(\$dictfile);
my %glossary;

findwords();

printlexicon(\%dictionary) if $dictionary_output;
printlexicon(\%glossary) if $glossary_output;


#  Readdict reads in the dictionary file defined above using
#  the Compress:Zlib CPAN module.  It returns a hash that is
#  used for all further dictionary operations.
#
sub readdict {
	my $dict = shift;
	my %dicthash;

	my $gz = gzopen($$dict, "rb") or die "Cannot open $$dict: $gzerrno\n" ;
	while ($gz->gzreadline($_) > 0) {
		chomp;
		$dicthash{lc($_)} = 0;
	}
	die "Error reading from $$dict: $gzerrno\n" if $gzerrno != Z_STREAM_END ;
	return %dicthash;
}

#  findwords() reads in a file and compares words found in the file
#  with the contents of the dictionary read in by the readdict
#  function.  It assigns counts to the elements of %dictionary and
#  creates %glossary elements and increases its values according to
#  the number of matches.
#
sub findwords {
	open my $if, "<", $file || die "Could not open $file: $!";
	while (<$if>) {
		chomp;
		my @elements = split(/[ ']/,$_);
		foreach my $element (@elements) {
			next if $element =~ /[^A-Za-zÀ-ÿ]/; #  Don't need digits
			$element = lc($element);
			$element =~ s/[\s\,\!\?\.\-\_\;\)\(\"\']//g;
			next if $element eq '';
			print "[$element]\n" if $token_debug;
			#  If the word matches a word in the dictionary, increase
			#  the match count by one, otherwise assign it to the
			#  glossary of words not found in the dictionary and up
			#  the glossary count.
			if ( exists $dictionary{$element} ) {
				$dictionary{$element}++;
			} else {
				$glossary{$element}++;
			}
		}
	}
}

#  Showmatches reads in a lexicon hash via a reference and prints all words out 
#  that have been seen in the findwords() function along with a frequency count.
#
sub  printlexicon {
	my $lexicon = shift;
	my $counter = 0;
	foreach my $key (sort keys %$lexicon) {
		if ( $$lexicon{$key} > 0 ) {
			print $key . " : " . $$lexicon{$key} . "\n";
			$counter++;
		}
	}
	print "\n$counter entries total\n";
}

__END__

=pod

=head1 dict-compare

A generic script for building dictionaries by comparing them to real-world texts.

=head1 SYNOPSIS

C<dict-compare [--glossary --dictionary] [--token-debug] file > output_file>

=head1 DESCRIPTION

This program compares the words in a given text file to a list of words from
a dictionary file.  It is capable of outputting lists of words that occur or
do not occur in a given dictionary file, along with their frequency in the
text.  Debugging output using token tag marks is also available.

=head2 Command-Line Options

=over 12

=item C<--help,-h,-?>

Prints a usage help screen.

=item C<--man,-m>

Prints out the manual entry for $0

=item C<--version,-v>

Prints out the program version.

=item C<--glossary>

Prints a glossary of words not found in the dictionary file and the number of
times they occur.

=item C<--dictionary>

Prints out the words from the text that had a dictionary match, along with
their respective frequencies.

=item C<--token-debug>

Prints tags around each token in the text to help sound out strange tokens.

=back

=head1 EXAMPLE

C<dict-compare --glossary myfile.txt>

This command reads in the text contained in myfile.txt and prints out a list
of words not found in the dictionary and their frequencies.

=back

=head1 AUTHOR

Damon "allolex" Davison - <allolex@sdf.freeshell.org>

=head1 LICENSE

This code is released under the same terms as Perl itself.

=cut