#!/usr/bin/perl use strict; use warnings; use Compress::Zlib; use Getopt::Long; use Pod::Usage; my $VERSION = 0.7; my $dictfile = 'dict.gz'; # Process command-line options my $help = ''; my $man = ''; my $version = ''; my $token_debug = ''; my $glossary_output = ''; my $dictionary_output = ''; GetOptions( 'help|?' => \$help, 'version' => \$version, 'man' => \$man, 'token-debug' => \$token_debug, 'glossary' => \$glossary_output, 'dictionary' => \$dictionary_output ); print "This is version $VERSION of $0.\n" if $version; exit(0) if ($version); pod2usage(1) if $help; pod2usage(-exitstatus => 0, -verbose => 2) if $man; my $file = shift; my %dictionary = readdict(\$dictfile); my %glossary; findwords(); printlexicon(\%dictionary) if $dictionary_output; printlexicon(\%glossary) if $glossary_output; # Readdict reads in the dictionary file defined above using # the Compress:Zlib CPAN module. It returns a hash that is # used for all further dictionary operations. # sub readdict { my $dict = shift; my %dicthash; my $gz = gzopen($$dict, "rb") or die "Cannot open $$dict: $gzerrno\n" ; while ($gz->gzreadline($_) > 0) { chomp; $dicthash{lc($_)} = 0; } die "Error reading from $$dict: $gzerrno\n" if $gzerrno != Z_STREAM_END ; return %dicthash; } # findwords() reads in a file and compares words found in the file # with the contents of the dictionary read in by the readdict # function. It assigns counts to the elements of %dictionary and # creates %glossary elements and increases its values according to # the number of matches. # sub findwords { open my $if, "<", $file || die "Could not open $file: $!"; while (<$if>) { chomp; my @elements = split(/[ ']/,$_); foreach my $element (@elements) { next if $element =~ /[^A-Za-zĄ-’]/; # Don't need digits $element = lc($element); $element =~ s/[\s\,\!\?\.\-\_\;\)\(\"\']//g; next if $element eq ''; print "[$element]\n" if $token_debug; # If the word matches a word in the dictionary, increase # the match count by one, otherwise assign it to the # glossary of words not found in the dictionary and up # the glossary count. if ( exists $dictionary{$element} ) { $dictionary{$element}++; } else { $glossary{$element}++; } } } } # Showmatches reads in a lexicon hash via a reference and prints all words out # that have been seen in the findwords() function along with a frequency count. # sub printlexicon { my $lexicon = shift; my $counter = 0; foreach my $key (sort keys %$lexicon) { if ( $$lexicon{$key} > 0 ) { print $key . " : " . $$lexicon{$key} . "\n"; $counter++; } } print "\n$counter entries total\n"; } __END__ =pod =head1 dict-compare A generic script for building dictionaries by comparing them to real-world texts. =head1 SYNOPSIS C output_file> =head1 DESCRIPTION This program compares the words in a given text file to a list of words from a dictionary file. It is capable of outputting lists of words that occur or do not occur in a given dictionary file, along with their frequency in the text. Debugging output using token tag marks is also available. =head2 Command-Line Options =over 12 =item C<--help,-h,-?> Prints a usage help screen. =item C<--man,-m> Prints out the manual entry for $0 =item C<--version,-v> Prints out the program version. =item C<--glossary> Prints a glossary of words not found in the dictionary file and the number of times they occur. =item C<--dictionary> Prints out the words from the text that had a dictionary match, along with their respective frequencies. =item C<--token-debug> Prints tags around each token in the text to help sound out strange tokens. =back =head1 EXAMPLE C This command reads in the text contained in myfile.txt and prints out a list of words not found in the dictionary and their frequencies. =back =head1 AUTHOR Damon "allolex" Davison - =head1 LICENSE This code is released under the same terms as Perl itself. =cut