http://qs321.pair.com?node_id=363122
Category: CGI Programming
Author/Contact Info Robin Turner robin@bilkent.edu.tr
Description: This is a CGI app to compare a text against the Academic Word List to give an idea of how academic/technical it is. It will work for uploaded plain text or HTML files, or with URLs, and could be adapted to use any set of word lists. There is a working version here. Note: it uses humungous arrays and is thus rather slow.
#!/usr/local/bin/perl

use CGI qw/:standard/;
use CGI::Carp('fatalsToBrowser');
$CGI::POST_MAX=1024 * 500; # Prevents really big uploads
use LWP::Simple;
use HTML::TokeParser;

# User-specific data - change this!

# URL of this program
$program_url = 'http://lists.bilkent.edu.tr/~robin/cgibin/awlcheck.cgi
+';

# Directory to store files
$filebase = '/home/robin/public_html/awlcheck/';

# URL of CSS stylesheet
$css = 'awlcheck.css';

# HTML title - diplayed in browser status bar
$title = "Check files against AWL";

# What's displayed at the top of the page
$heading = "Check files against the Academic Word List";

# Name of main list
$listname = "AWL";

# Generic name of sublists
$sublist = "sublist"; 

# The number of sublists
$number_of_sublists = 10;

# Name of file for General Service List
$gsl = "gsl";


# Main program
print header,
start_html(-title=>"$title",
    -style=>{-src=>$css});

print_form()    unless param;
print_results() if param;

print end_html;
close $file;


# The first page
sub print_form {

    print start_multipart_form(),
    h2("$heading"),
    h4("Upload a file from your computer ..."),
    p filefield(-name=>'upload',-size=>70),
            submit(-label=>'Process File'),
    p,
    h4("... or type the URL of a web page here."),
    p textfield(-name=>'url',
        -size=>70,
        -maxlength=>80),
        submit(-label=>'Process web page'),
    p,
    checkbox(-name=>'print_words',
        -label=>' Check if you want found words to be printed'),
    p("This program is designed to work on plain text (.txt) or HTML f
+iles. The program will process other file types, but the results will
+ be wrong, so if you have something like a Word document, save the fi
+le as plain text first."),
    
        p("This program does some serious number-crunching, which can 
+take a while if you have a large file. If you're uploading your PhD t
+hesis, you might want to make dinner while you wait for the results."
+);
    
    end_form;
}

# Processing the file
sub print_results {
    
    
    my $file = param('upload');
    my $print_words = param('print_words');
    $url = param('url');
    if ($url) { 
        
        $webpage = get($url);

        my $stream = HTML::TokeParser->new( \$webpage );
        while (my $token = $stream->get_token) {
              $text_string = $stream->get_text();
            
            @line = split (/\W+/, $text_string);
            push @words, @line;
        }        
         
    }
    
    
    if ($file) {
        my $stream = HTML::TokeParser->new( $file );
        while (my $token = $stream->get_token) {
              $text_string = $stream->get_text();
            
            @line = split (/\W+/, $text_string);
            push @words, @line;
        }
    }

    $wordcount = @words;
    
    
    
        # Compare with General Service List
    open (GSL, "$filebase$gsl") or print "Cannot open GSL";
    undef $INPUT_RECORD_SEPARATOR;
    $common = <GSL>;
    close (GSL);
    
    foreach (@words){
        $search = $_;
        if ($common =~ /$search/i) {$gslcount++;}
    }
    

    for ($sl = 1; $sl <= $number_of_sublists; ++$sl) {
        push @lists, 0;
        # Open the appropriate sublist
        open (SUBLIST, "$filebase$sublist$sl") or die "Cannot open sub
+list";
    
        while (<SUBLIST>) {
            s/-//g; # Get rid of hyphens
            @sublist = split (/\W+/, $_);
        }        
        for ($i = 0; $i <= @words; ++$i) {
            $search = $words[$i];
            
            foreach (@sublist){
                
                if ($search eq $_) {
                    $lists[$sl - 1]++;
                    $totalawl++; 
                    # Collect found words for later printing
                    if ($print_words eq "on"){
                        for ($i2 = 0; $i2 <= @found_words; ++$i2) {
                            if ($found_words[$i2] eq $_) {
                                $already_found = "yes";
                            }
                        }
                        unless ($already_found eq "yes"){
                            push @found_words, $_;
                        }
                    $already_found = "no";
                    }    
                }
            
                
            }
            
        }

    close (SUBLIST);
    }
    
    # Check for empty files and avoid illegal division by zero
    if ($wordcount < 1) {
        $wordcount = 1;
        print "WARNING! This file seems to be empty!";
    }
    
    
    $frequency = int($totalawl / $wordcount * 100);
    $frequency_gsl = int($gslcount / $wordcount * 1000);
    $remainder = 100 - $frequency;
    
    
    if ($frequency < 1) {$perthousand = "Less than 1";} 
    else {$perthousand = $frequency * 10;}
    
    $ratio = $perthousand / $frequency_gsl * 100;
    
    if ($ratio > 30) {
        $comment = '"erudite"';
    } elsif  ($ratio > 10) {
        $comment = '"academic"';
    } elsif  ($ratio > 5) {
        $comment = '"literate"';
    } else {$comment = '"colloquial"';}
    
    # Print the results
    if ($file) {
        print h2("Results for $file");
    } else {
        print h2("Results for $url");
    }
        
    print h5("Comment: $comment"),
    h5("Total words in file = $wordcount"),
    h5("Common words = $frequency_gsl per thousand"),
    h5("Words in $listname = $perthousand per thousand"),
    p;
    
    for ($i2 = 0; $i2 <= $frequency; ++$i2){
            print img {src=>'red.gif', width=>'2', height=>'20', align
+=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        }
        for ($i2 = 0; $i2 <= $remainder; ++$i2){
            print img {src=>'white.gif', width=>'2', height=>'20', ali
+gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        }
    print '<p>&nbsp;</p><h4>Breakdown by Sublist</h4><p>';
    # For some reason using CGI.pm commands for this confuses Internet
+ Explorer
    
    
    for ($sl = 0; $sl < $number_of_sublists; ++$sl) {
        $frequency = int($lists[$sl] / $wordcount * 1000 + 0.5);    
        $percent = int($lists[$sl] / $totalawl * 100 + 0.5);
        $remainder = 100 - $percent;
        print br;
        for ($i2 = 0; $i2 <= $percent; ++$i2){
            print img {src=>'red.gif', width=>'2', height=>'20', align
+=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        }
        for ($i2 = 0; $i2 <= $remainder; ++$i2){
            print img {src=>'white.gif', width=>'2', height=>'20', ali
+gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
            
        }
        
        
        print '&nbsp; Sublist ';
        print $sl+1;
        print ": $percent";
        print '% ';
        print "($lists[$sl] words)";
        print br;
    }
    print "</p>";
    # Print out found words if required
    if ($print_words eq "on"){
        print h3("Words Found");
        @found_words = sort @found_words;
    
#         for ($i = 0; $i <= @found_words; ++$i) {
            print p("@found_words");
#         }
    }    
    
    restore_parameters();
    
    print h4 a({-href=>"$program_url"},"Process another file");
    print h4 a({-href=>"awlcheckexplained.html"},"What these results m
+ean");
    
    
}