#!/usr/local/bin/perl
use CGI qw/:standard/;
use CGI::Carp('fatalsToBrowser');
$CGI::POST_MAX=1024 * 500; # Prevents really big uploads
use LWP::Simple;
use HTML::TokeParser;
# User-specific data - change this!
# These are package globals (no "my") because print_form() and
# print_results() read them by name.
# URL of this program
$program_url = 'http://lists.bilkent.edu.tr/~robin/cgibin/awlcheck.cgi';
# Directory to store files
$filebase = '/home/robin/public_html/awlcheck/';
# URL of CSS stylesheet
$css = 'awlcheck.css';
# HTML title - displayed in the browser title bar
$title = "Check files against AWL";
# What's displayed at the top of the page
$heading = "Check files against the Academic Word List";
# Name of main list
$listname = "AWL";
# Generic name of sublists
$sublist = "sublist";
# The number of sublists
$number_of_sublists = 10;
# Name of file for General Service List
$gsl = "gsl";
# Main program
# A request that carries no parameters gets the entry form; any request
# with parameters is treated as a submission and gets the results page.
print header(),
    start_html(-title => $title,
               -style => {-src => $css});
if (param()) {
    print_results();
}
else {
    print_form();
}
print end_html();
# The first page
# print_form()
# Prints the entry page: a multipart form offering a file upload field,
# a URL field, a "print found words" checkbox and two explanatory
# paragraphs.  Takes no arguments and reads the global $heading.
sub print_form {
    print start_multipart_form(),
        h2($heading),
        h4("Upload a file from your computer ..."),
        # Explicit parentheses: a bare "p filefield(...)" would swallow
        # every following list element into one nested paragraph.
        p(filefield(-name => 'upload', -size => 70),
          submit(-label => 'Process File')),
        h4("... or type the URL of a web page here."),
        p(textfield(-name      => 'url',
                    -size      => 70,
                    -maxlength => 80),
          submit(-label => 'Process web page')),
        p(checkbox(-name  => 'print_words',
                   -label => ' Check if you want found words to be printed')),
        p("This program is designed to work on plain text (.txt) or HTML files. The program will process other file types, but the results will be wrong, so if you have something like a Word document, save the file as plain text first."),
        p("This program does some serious number-crunching, which can take a while if you have a large file. If you're uploading your PhD thesis, you might want to make dinner while you wait for the results."),
        # end_form() must be part of the printed list, otherwise the
        # closing </form> tag is computed and thrown away.
        end_form();
    return;
}
# Processing the file
# print_results()
# Handles a form submission: collects the words of the uploaded file
# and/or the web page named in the "url" parameter, counts how many of
# them appear in the General Service List and in each AWL sublist, and
# prints the statistics, bar charts and (optionally) the AWL words found.
#
# Takes no arguments.  Reads the globals $filebase, $gsl, $sublist,
# $number_of_sublists, $listname and $program_url.
# Dies if a sublist file cannot be opened; a missing GSL file only
# produces an on-page message.
# Word matching is case-insensitive and whole-word for both lists.
sub print_results {
    my $file        = param('upload');        # client-supplied name, display only
    my $upload_fh   = CGI::upload('upload');  # undef unless a real file upload
    my $print_words = param('print_words');
    my $url         = param('url');
    $print_words = '' unless defined $print_words;

    # Gather the words to analyse.
    my @words;
    if ($url) {
        # LWP also understands file:, ftp: etc.; only fetch real web pages.
        if ($url =~ m{\Ahttps?://}i) {
            my $webpage = get($url);
            if (defined $webpage) {
                push @words,
                    _awl_extract_words(HTML::TokeParser->new(\$webpage));
            }
            else {
                print p("Could not fetch " . CGI::escapeHTML($url));
            }
        }
        else {
            print p("Only http:// and https:// addresses can be processed.");
        }
    }
    # Parse only a genuine upload handle.  Passing the raw parameter value
    # would make HTML::TokeParser open a server-side path whenever "upload"
    # arrives as an ordinary text field.
    if ($upload_fh) {
        push @words, _awl_extract_words(HTML::TokeParser->new($upload_fh));
    }
    my $wordcount = @words;

    # Load the General Service List (best effort, as before).
    my $gsl_words = _awl_load_wordlist("$filebase$gsl", 0);
    print "Cannot open GSL" unless $gsl_words;
    my %is_common = map { $_ => 1 } @{ $gsl_words || [] };

    # Load the sublists into one lookup table: word => sublist index.
    my @lists = (0) x $number_of_sublists;
    my %sublist_of;
    for my $sl (1 .. $number_of_sublists) {
        my $entries = _awl_load_wordlist("$filebase$sublist$sl", 1)
            or die "Cannot open sublist $sl: $!";
        for my $entry (@$entries) {
            # First sublist wins if a word is listed more than once.
            $sublist_of{$entry} = $sl - 1 unless exists $sublist_of{$entry};
        }
    }

    # Single pass over the text.
    my $gslcount = 0;
    my $totalawl = 0;
    my %found;
    for my $word (@words) {
        my $key = lc $word;
        $gslcount++ if $is_common{$key};
        next unless exists $sublist_of{$key};
        $lists[ $sublist_of{$key} ]++;
        $totalawl++;
        # Collect found words for later printing
        $found{$key} = 1 if $print_words eq "on";
    }

    # Check for empty files and avoid illegal division by zero
    if ($wordcount < 1) {
        print "WARNING! This file seems to be empty!";
    }
    my $divisor          = $wordcount || 1;
    my $frequency        = int($totalawl / $divisor * 100);    # percent, for the bar
    my $frequency_gsl    = int($gslcount / $divisor * 1000);
    my $awl_per_thousand = $totalawl / $divisor * 1000;
    my $perthousand      = $awl_per_thousand < 1
                         ? "Less than 1"
                         : int($awl_per_thousand);

    # AWL words as a percentage of common words.  With no common words at
    # all the ratio is undefined; treat any AWL hit as the top band then.
    my $ratio = $gslcount ? $totalawl / $gslcount * 100
              : $totalawl ? 100
              :             0;
    my $comment;
    if ($ratio > 30) {
        $comment = '"erudite"';
    }
    elsif ($ratio > 10) {
        $comment = '"academic"';
    }
    elsif ($ratio > 5) {
        $comment = '"literate"';
    }
    else {
        $comment = '"colloquial"';
    }

    # Print the results.  The file name and URL come from the client, so
    # they are escaped before being placed in the page.
    my $source_name = $file ? "$file" : (defined $url ? $url : '');
    print h2("Results for " . CGI::escapeHTML($source_name));
    print h5("Comment: $comment"),
        h5("Total words in file = $wordcount"),
        h5("Common words = $frequency_gsl per thousand"),
        h5("Words in $listname = $perthousand per thousand"),
        p();
    print _awl_bar($frequency);

    # For some reason using CGI.pm commands for this confuses Internet Explorer
    print '<p> </p><h4>Breakdown by Sublist</h4><p>';
    for my $sl (0 .. $number_of_sublists - 1) {
        my $percent = $totalawl
                    ? int($lists[$sl] / $totalawl * 100 + 0.5)
                    : 0;
        print br();
        print _awl_bar($percent);
        print ' Sublist ';
        print $sl + 1;
        print ": $percent";
        print '% ';
        print "($lists[$sl] words)";
        print br();
    }
    print "</p>";

    # Print out found words if required
    if ($print_words eq "on") {
        print h3("Words Found");
        print p(join ' ', sort keys %found);
    }

    # Kept from the original: re-creates CGI.pm's default query object.
    # NOTE(review): presumably meant to reset form state - confirm it is
    # still needed.
    restore_parameters();
    print h4(a({-href => $program_url}, "Process another file"));
    print h4(a({-href => "awlcheckexplained.html"}, "What these results mean"));
    return;
}

# Returns the list of non-empty words found in the text of an
# HTML::TokeParser stream.  get_text() is called before each get_token()
# so that text preceding the first tag (for example the start of a
# plain-text file) is not thrown away.
sub _awl_extract_words {
    my ($stream) = @_;
    return unless $stream;
    my @extracted;
    while (1) {
        my $text = $stream->get_text();
        push @extracted, grep { length } split /\W+/, $text;
        last unless $stream->get_token();    # skip the tag; undef at end of input
    }
    return @extracted;
}

# Reads a word-list file and returns a reference to its lower-cased,
# non-empty words, or nothing if the file cannot be opened ($! is left
# set by open).  When $strip_hyphens is true, hyphens are removed before
# splitting, as the sublist handling always did.
sub _awl_load_wordlist {
    my ($path, $strip_hyphens) = @_;
    open(my $fh, '<', $path) or return;
    my @entries;
    while (my $line = <$fh>) {
        $line =~ s/-//g if $strip_hyphens;    # Get rid of hyphens
        push @entries, grep { length } split /\W+/, lc $line;
    }
    close $fh;
    return \@entries;
}

# Returns the HTML for a 100-segment bar: $percent red images followed by
# white images for the remainder.  $percent is clamped to 0..100.
sub _awl_bar {
    my ($percent) = @_;
    $percent = 0   if $percent < 0;
    $percent = 100 if $percent > 100;
    my %shared = (
        width   => '2',
        height  => '20',
        align   => 'LEFT',
        border  => '0',
        padding => '0',
        hspace  => '0',
        vspace  => '0',
    );
    my $red   = img({ src => 'red.gif',   %shared });
    my $white = img({ src => 'white.gif', %shared });
    return ($red x $percent) . ($white x (100 - $percent));
}