Category: CGI Programming
Author/Contact Info: Robin Turner
Description: This is a CGI app to compare a text against the Academic Word List to give an idea of how academic/technical it is. It will work for uploaded plain text or HTML files, or with URLs, and could be adapted to use any set of word lists. There is a working version here. Note: it uses humungous arrays and is thus rather slow.

use CGI qw/:standard/;
use CGI::Carp('fatalsToBrowser');
$CGI::POST_MAX=1024 * 500; # Prevents really big uploads
use LWP::Simple;
use HTML::TokeParser;

# User-specific data - change this!

# URL of this program (used for the "Process another file" link).
# NOTE(review): the literal URL that originally stood here was lost and the
# line was left as an unterminated quote, which swallowed the code below it.
# Default to the URL CGI.pm reports for the running script; replace it with
# a fixed string if the script is reached through a proxy or an alias.
$program_url = url();

# Directory holding the word-list files. File names are appended directly
# to this value, so it must end with a slash.
$filebase = '/home/robin/public_html/awlcheck/';

# URL of CSS stylesheet
$css = 'awlcheck.css';

# HTML title - displayed in the browser title bar
$title = "Check files against AWL";

# What's displayed at the top of the page
$heading = "Check files against the Academic Word List";

# Name of main list
$listname = "AWL";

# Generic name of sublists; the sublist number is appended to form the
# file name (sublist1, sublist2, ...)
$sublist = "sublist";

# The number of sublists
$number_of_sublists = 10;

# Name of file for General Service List
$gsl = "gsl";

# Main program
#
# Always send the HTTP header and the page preamble first. The previous
# code ended the header line with a comma, so the following statement was
# parsed as part of the same print and fell under its "unless param"
# modifier: no header was sent at all when a form had been submitted.
print header,
      start_html(-title => $title, -style => { 'src' => $css }),
      h1($heading);

# No parameters means a first visit: show the upload form.
# Otherwise a form was submitted: analyse the file or URL.
if (param()) {
    print_results();
}
else {
    print_form();
}

print end_html;

# The upload handle is lexical to print_results() and is not visible here,
# so the former "close $file" acted on an unset global and has been dropped.

# The first page: prints the multipart upload form.
#
# The form offers three inputs, all read back by print_results():
#   upload       - a file chosen from the user's computer
#   url          - the address of a web page to fetch instead
#   print_words  - checkbox; when ticked it submits CGI.pm's default
#                  value "on", which is what print_results() tests for
#
# Takes no arguments and returns nothing; output goes to STDOUT.
sub print_form {

    # Every element is called with explicit parentheses. Without them,
    # "p filefield(...), ..." makes p() swallow the rest of the list.
    print start_multipart_form(),
        h4("Upload a file from your computer ..."),
        p(
            filefield(-name => 'upload', -size => 70),
            submit(-label => 'Process File'),
        ),
        h4("... or type the URL of a web page here."),
        p(
            textfield(-name => 'url'),
            submit(-label => 'Process web page'),
        ),
        p(
            checkbox(
                -name  => 'print_words',
                -label => ' Check if you want found words to be printed',
            ),
        ),
        p(    "This program is designed to work on plain text (.txt) or "
            . "HTML files. The program will process other file types, but "
            . "the results will be wrong, so if you have something like a "
            . "Word document, save the file as plain text first."),
        p(    "This program does some serious number-crunching, which can "
            . "take a while if you have a large file. If you're uploading "
            . "your PhD thesis, you might want to make dinner while you "
            . "wait for the results."),
        end_form();

    return;
}

# Processing the file
sub print_results {
    my $file = param('upload');
    my $print_words = param('print_words');
    $url = param('url');
    if ($url) { 
        $webpage = get($url);

        my $stream = HTML::TokeParser->new( \$webpage );
        while (my $token = $stream->get_token) {
              $text_string = $stream->get_text();
            @line = split (/\W+/, $text_string);
            push @words, @line;
    if ($file) {
        my $stream = HTML::TokeParser->new( $file );
        while (my $token = $stream->get_token) {
              $text_string = $stream->get_text();
            @line = split (/\W+/, $text_string);
            push @words, @line;

    $wordcount = @words;
        # Compare with General Service List
    open (GSL, "$filebase$gsl") or print "Cannot open GSL";
    $common = <GSL>;
    close (GSL);
    foreach (@words){
        $search = $_;
        if ($common =~ /$search/i) {$gslcount++;}

    for ($sl = 1; $sl <= $number_of_sublists; ++$sl) {
        push @lists, 0;
        # Open the appropriate sublist
        open (SUBLIST, "$filebase$sublist$sl") or die "Cannot open sub
        while (<SUBLIST>) {
            s/-//g; # Get rid of hyphens
            @sublist = split (/\W+/, $_);
        for ($i = 0; $i <= @words; ++$i) {
            $search = $words[$i];
            foreach (@sublist){
                if ($search eq $_) {
                    $lists[$sl - 1]++;
                    # Collect found words for later printing
                    if ($print_words eq "on"){
                        for ($i2 = 0; $i2 <= @found_words; ++$i2) {
                            if ($found_words[$i2] eq $_) {
                                $already_found = "yes";
                        unless ($already_found eq "yes"){
                            push @found_words, $_;
                    $already_found = "no";

    close (SUBLIST);
    # Check for empty files and avoid illegal division by zero
    if ($wordcount < 1) {
        $wordcount = 1;
        print "WARNING! This file seems to be empty!";
    $frequency = int($totalawl / $wordcount * 100);
    $frequency_gsl = int($gslcount / $wordcount * 1000);
    $remainder = 100 - $frequency;
    if ($frequency < 1) {$perthousand = "Less than 1";} 
    else {$perthousand = $frequency * 10;}
    $ratio = $perthousand / $frequency_gsl * 100;
    if ($ratio > 30) {
        $comment = '"erudite"';
    } elsif  ($ratio > 10) {
        $comment = '"academic"';
    } elsif  ($ratio > 5) {
        $comment = '"literate"';
    } else {$comment = '"colloquial"';}
    # Print the results
    if ($file) {
        print h2("Results for $file");
    } else {
        print h2("Results for $url");
    print h5("Comment: $comment"),
    h5("Total words in file = $wordcount"),
    h5("Common words = $frequency_gsl per thousand"),
    h5("Words in $listname = $perthousand per thousand"),
    for ($i2 = 0; $i2 <= $frequency; ++$i2){
            print img {src=>'red.gif', width=>'2', height=>'20', align
+=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        for ($i2 = 0; $i2 <= $remainder; ++$i2){
            print img {src=>'white.gif', width=>'2', height=>'20', ali
+gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
    print '<p>&nbsp;</p><h4>Breakdown by Sublist</h4><p>';
    # For some reason using commands for this confuses Internet
+ Explorer
    for ($sl = 0; $sl < $number_of_sublists; ++$sl) {
        $frequency = int($lists[$sl] / $wordcount * 1000 + 0.5);    
        $percent = int($lists[$sl] / $totalawl * 100 + 0.5);
        $remainder = 100 - $percent;
        print br;
        for ($i2 = 0; $i2 <= $percent; ++$i2){
            print img {src=>'red.gif', width=>'2', height=>'20', align
+=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        for ($i2 = 0; $i2 <= $remainder; ++$i2){
            print img {src=>'white.gif', width=>'2', height=>'20', ali
+gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        print '&nbsp; Sublist ';
        print $sl+1;
        print ": $percent";
        print '% ';
        print "($lists[$sl] words)";
        print br;
    print "</p>";
    # Print out found words if required
    if ($print_words eq "on"){
        print h3("Words Found");
        @found_words = sort @found_words;
#         for ($i = 0; $i <= @found_words; ++$i) {
            print p("@found_words");
#         }
    print h4 a({-href=>"$program_url"},"Process another file");
    print h4 a({-href=>"awlcheckexplained.html"},"What these results m