awlcheck

Category:	CGI Programming
Author/Contact Info	Robin Turner robin@bilkent.edu.tr
Description:	This is a CGI app to compare a text against the Academic Word List to give an idea of how academic/technical it is. It will work for uploaded plain text or HTML files, or with URLs, and could be adapted to use any set of word lists. There is a working version here. Note: it uses humungous arrays and is thus rather slow.
#!/usr/local/bin/perl use CGI qw/:standard/; use CGI::Carp('fatalsToBrowser'); $CGI::POST_MAX=1024 * 500; # Prevents really big uploads use LWP::Simple; use HTML::TokeParser; # User-specific data - change this! # URL of this program $program_url = 'http://lists.bilkent.edu.tr/~robin/cgibin/awlcheck.cgi +'; # Directory to store files $filebase = '/home/robin/public_html/awlcheck/'; # URL of CSS stylesheet $css = 'awlcheck.css'; # HTML title - diplayed in browser status bar $title = "Check files against AWL"; # What's displayed at the top of the page $heading = "Check files against the Academic Word List"; # Name of main list $listname = "AWL"; # Generic name of sublists $sublist = "sublist"; # The number of sublists $number_of_sublists = 10; # Name of file for General Service List $gsl = "gsl"; # Main program print header, start_html(-title=>"$title", -style=>{-src=>$css}); print_form() unless param; print_results() if param; print end_html; close $file; # The first page sub print_form { print start_multipart_form(), h2("$heading"), h4("Upload a file from your computer ..."), p filefield(-name=>'upload',-size=>70), submit(-label=>'Process File'), p, h4("... or type the URL of a web page here."), p textfield(-name=>'url', -size=>70, -maxlength=>80), submit(-label=>'Process web page'), p, checkbox(-name=>'print_words', -label=>' Check if you want found words to be printed'), p("This program is designed to work on plain text (.txt) or HTML f +iles. The program will process other file types, but the results will + be wrong, so if you have something like a Word document, save the fi +le as plain text first."), p("This program does some serious number-crunching, which can +take a while if you have a large file. If you're uploading your PhD t +hesis, you might want to make dinner while you wait for the results." +); end_form; } # Processing the file sub print_results { my $file = param('upload'); my $print_words = param('print_words'); $url = param('url'); if ($url) { $webpage = get($url); my $stream = HTML::TokeParser->new( \$webpage ); while (my $token = $stream->get_token) { $text_string = $stream->get_text(); @line = split (/\W+/, $text_string); push @words, @line; } } if ($file) { my $stream = HTML::TokeParser->new( $file ); while (my $token = $stream->get_token) { $text_string = $stream->get_text(); @line = split (/\W+/, $text_string); push @words, @line; } } $wordcount = @words; # Compare with General Service List open (GSL, "$filebase$gsl") or print "Cannot open GSL"; undef $INPUT_RECORD_SEPARATOR; $common = <GSL>; close (GSL); foreach (@words){ $search = $_; if ($common =~ /$search/i) {$gslcount++;} } for ($sl = 1; $sl <= $number_of_sublists; ++$sl) { push @lists, 0; # Open the appropriate sublist open (SUBLIST, "$filebase$sublist$sl") or die "Cannot open sub +list"; while (<SUBLIST>) { s/-//g; # Get rid of hyphens @sublist = split (/\W+/, $_); } for ($i = 0; $i <= @words; ++$i) { $search = $words[$i]; foreach (@sublist){ if ($search eq $_) { $lists[$sl - 1]++; $totalawl++; # Collect found words for later printing if ($print_words eq "on"){ for ($i2 = 0; $i2 <= @found_words; ++$i2) { if ($found_words[$i2] eq $_) { $already_found = "yes"; } } unless ($already_found eq "yes"){ push @found_words, $_; } $already_found = "no"; } } } } close (SUBLIST); } # Check for empty files and avoid illegal division by zero if ($wordcount < 1) { $wordcount = 1; print "WARNING! This file seems to be empty!"; } $frequency = int($totalawl / $wordcount * 100); $frequency_gsl = int($gslcount / $wordcount * 1000); $remainder = 100 - $frequency; if ($frequency < 1) {$perthousand = "Less than 1";} else {$perthousand = $frequency * 10;} $ratio = $perthousand / $frequency_gsl * 100; if ($ratio > 30) { $comment = '"erudite"'; } elsif ($ratio > 10) { $comment = '"academic"'; } elsif ($ratio > 5) { $comment = '"literate"'; } else {$comment = '"colloquial"';} # Print the results if ($file) { print h2("Results for $file"); } else { print h2("Results for $url"); } print h5("Comment: $comment"), h5("Total words in file = $wordcount"), h5("Common words = $frequency_gsl per thousand"), h5("Words in $listname = $perthousand per thousand"), p; for ($i2 = 0; $i2 <= $frequency; ++$i2){ print img {src=>'red.gif', width=>'2', height=>'20', align +=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'}; } for ($i2 = 0; $i2 <= $remainder; ++$i2){ print img {src=>'white.gif', width=>'2', height=>'20', ali +gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'}; } print '<p> </p><h4>Breakdown by Sublist</h4><p>'; # For some reason using CGI.pm commands for this confuses Internet + Explorer for ($sl = 0; $sl < $number_of_sublists; ++$sl) { $frequency = int($lists[$sl] / $wordcount * 1000 + 0.5); $percent = int($lists[$sl] / $totalawl * 100 + 0.5); $remainder = 100 - $percent; print br; for ($i2 = 0; $i2 <= $percent; ++$i2){ print img {src=>'red.gif', width=>'2', height=>'20', align +=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'}; } for ($i2 = 0; $i2 <= $remainder; ++$i2){ print img {src=>'white.gif', width=>'2', height=>'20', ali +gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'}; } print '  Sublist '; print $sl+1; print ": $percent"; print '% '; print "($lists[$sl] words)"; print br; } print "</p>"; # Print out found words if required if ($print_words eq "on"){ print h3("Words Found"); @found_words = sort @found_words; # for ($i = 0; $i <= @found_words; ++$i) { print p("@found_words"); # } } restore_parameters(); print h4 a({-href=>"$program_url"},"Process another file"); print h4 a({-href=>"awlcheckexplained.html"},"What these results m +ean"); }

Comment on awlcheck Download Code

Back to Code Catacombs