#!/usr/local/bin/perl
#
# CGI script that checks a text against the Academic Word List (AWL).
#
# With no request parameters it prints an upload/URL form.  With parameters
# it extracts the words of the uploaded file and/or the fetched web page,
# counts how many of them occur in the General Service List (GSL) file and
# in each AWL sublist file under $filebase, and prints the statistics as
# HTML (overall bar, per-sublist bars and, optionally, the words found).

use strict;
use warnings;

use CGI qw/:standard/;
use CGI::Carp('fatalsToBrowser');
use LWP::Simple;
use HTML::TokeParser;

$CGI::POST_MAX = 1024 * 500;    # Prevents really big uploads

# User-specific data - change this!

# URL of this program
my $program_url = 'http://lists.bilkent.edu.tr/~robin/cgibin/awlcheck.cgi';

# Directory to store files
my $filebase = '/home/robin/public_html/awlcheck/';

# URL of CSS stylesheet
my $css = 'awlcheck.css';

# HTML title - displayed in browser status bar
my $title = "Check files against AWL";

# What's displayed at the top of the page
my $heading = "Check files against the Academic Word List";

# Name of main list
my $listname = "AWL";

# Generic name of sublists (sublist files are named "$sublist$number")
my $sublist = "sublist";

# The number of sublists
my $number_of_sublists = 10;

# Name of file for General Service List
my $gsl = "gsl";

# Main program
print header(), start_html(-title => $title, -style => { -src => $css });
if (param()) {
    print_results();
}
else {
    print_form();
}
print end_html();

# The first page: prints the multipart form offering a file upload, a URL
# field and the "print found words" checkbox.  Takes no arguments.
sub print_form {
    print start_multipart_form(),
        h2($heading),
        h4("Upload a file from your computer ..."),
        p(filefield(-name => 'upload', -size => 70)),
        submit(-label => 'Process File'),
        p(),
        h4("... or type the URL of a web page here."),
        p(textfield(-name => 'url', -size => 70, -maxlength => 80)),
        submit(-label => 'Process web page'),
        p(),
        checkbox(
            -name  => 'print_words',
            -label => ' Check if you want found words to be printed'
        ),
        p(      "This program is designed to work on plain text (.txt) or "
              . "HTML files. The program will process other file types, but "
              . "the results will be wrong, so if you have something like a "
              . "Word document, save the file as plain text first."),
        p(      "This program does some serious number-crunching, which can "
              . "take a while if you have a large file. If you're uploading "
              . "your PhD thesis, you might want to make dinner while you "
              . "wait for the results."),
        end_form();    # must be part of the print list, or the form is never closed
    return;
}

# Processing the file: reads the 'upload', 'url' and 'print_words' request
# parameters, counts GSL and AWL words in the submitted text and prints the
# results page body.  Takes no arguments.  Dies if a sublist file cannot be
# opened; a missing GSL file is only reported.
sub print_results {
    my $upload_name = param('upload');
    my $upload_fh   = upload('upload');    # defined only for a genuine upload
    my $url         = param('url');
    my $print_flag  = param('print_words');
    my $print_words = defined $print_flag && $print_flag eq 'on';

    my @words;

    if (defined $url && length $url) {
        # Only plain web URLs are fetched: LWP would otherwise also accept
        # schemes such as file://, which read from the server itself.
        # NOTE(review): internal http hosts are still reachable from here.
        if ($url =~ m{\Ahttps?://}i) {
            my $webpage = get($url);
            if (defined $webpage) {
                push @words, _extract_words(\$webpage);
            }
            else {
                print p("Could not retrieve " . escapeHTML($url));
            }
        }
        else {
            print p("Only http:// and https:// URLs can be processed.");
        }
    }

    # Use the upload handle rather than the raw parameter value: a plain
    # string in 'upload' would be opened by the parser as a local file name.
    if ($upload_fh) {
        push @words, _extract_words($upload_fh);
    }

    my $wordcount = @words;

    # Compare with General Service List
    my $gsl_words = _load_word_set("$filebase$gsl", 0);
    if (!$gsl_words) {
        print "Cannot open GSL";
        $gsl_words = {};
    }
    my $gslcount = grep { $gsl_words->{ lc $_ } } @words;

    # Compare with each sublist of the main list
    my @sublist_counts;
    my $totalawl = 0;
    my %found;
    for my $number (1 .. $number_of_sublists) {
        my $entries = _load_word_set("$filebase$sublist$number", 1)
            or die "Cannot open sublist $number: $!";
        my $count = 0;
        for my $word (@words) {
            my $key = lc $word;
            next unless $entries->{$key};
            $count++;
            $found{$key} = 1;    # collect found words for later printing
        }
        push @sublist_counts, $count;
        $totalawl += $count;
    }

    # Check for empty files and avoid illegal division by zero
    if ($wordcount < 1) {
        print "WARNING! This file seems to be empty!";
    }
    my $divisor = $wordcount || 1;

    my $awl_percent      = int($totalawl / $divisor * 100);
    my $awl_per_thousand = int($totalawl / $divisor * 1000);
    my $gsl_per_thousand = int($gslcount / $divisor * 1000);
    my $awl_display =
        $awl_per_thousand < 1 ? "Less than 1" : $awl_per_thousand;
    my $comment = _comment_for($totalawl, $gslcount);

    # Print the results.  The file name and URL come from the request, so
    # they are escaped before being echoed into the page.
    my $label = $upload_fh ? "$upload_name" : (defined $url ? $url : '');
    print h2("Results for " . escapeHTML($label));
    print h5("Comment: $comment"),
        h5("Total words in file = $wordcount"),
        h5("Common words = $gsl_per_thousand per thousand"),
        h5("Words in $listname = $awl_display per thousand"),
        p();
    print _bar($awl_percent);

    # For some reason using CGI.pm commands for this confuses Internet Explorer
    # NOTE(review): the markup around this heading appears to have been lost
    # from the source; a plain <h3> is emitted here - confirm the layout.
    print "\n<h3>Breakdown by Sublist</h3>\n";
    for my $index (0 .. $number_of_sublists - 1) {
        my $count = $sublist_counts[$index];

        # Share of all AWL hits that fall in this sublist, rounded.
        my $percent =
            $totalawl ? int($count / $totalawl * 100 + 0.5) : 0;
        print br();
        print _bar($percent);
        print ' Sublist ', $index + 1, ": $percent", '% ',
            "($count words)";
        print br();
    }
    print "\n\n";

    # Print out found words if required
    if ($print_words) {
        my @found_words = sort keys %found;
        print h3("Words Found");
        print p("@found_words");
    }

    restore_parameters();
    print h4(a({ -href => $program_url }, "Process another file"));
    print h4(a({ -href => "awlcheckexplained.html" },
               "What these results mean"));
    return;
}

# Splits a string on runs of non-word characters and returns the non-empty
# pieces (split yields an empty leading field when the text starts with a
# separator; that is not a word).
sub _split_words {
    my ($text) = @_;
    return () unless defined $text;
    return grep { length } split /\W+/, $text;
}

# Returns the words of a document.  $source is whatever HTML::TokeParser->new
# accepts; callers here pass a scalar reference or an open filehandle.
sub _extract_words {
    my ($source) = @_;
    my $parser = HTML::TokeParser->new($source) or return ();
    my @words;
    while (1) {
        # Collect the text first, then step over the tag that ended it, so
        # text that precedes the first tag (e.g. a plain .txt file) is kept.
        push @words, _split_words($parser->get_text);
        $parser->get_token or last;
    }
    return @words;
}

# Reads a word-list file and returns a hash reference whose keys are the
# lowercased words, or nothing if the file cannot be opened ($! is left set
# for the caller).  When $strip_hyphens is true, hyphens are removed before
# splitting so a hyphenated entry becomes a single word.
sub _load_word_set {
    my ($path, $strip_hyphens) = @_;
    open my $fh, '<', $path or return;
    my %set;
    while (my $line = <$fh>) {
        $line =~ s/-//g if $strip_hyphens;
        $set{ lc $_ } = 1 for _split_words($line);
    }
    close $fh;
    return \%set;
}

# Chooses the style comment from the ratio (in percent) of AWL hits to GSL
# hits.  With no GSL hits at all the ratio is undefined; any AWL hit is then
# treated as exceeding the top threshold.
sub _comment_for {
    my ($awl_count, $gsl_count) = @_;
    my $ratio =
          $gsl_count ? $awl_count / $gsl_count * 100
        : $awl_count ? 100
        :              0;
    return '"erudite"'  if $ratio > 30;
    return '"academic"' if $ratio > 10;
    return '"literate"' if $ratio > 5;
    return '"colloquial"';
}

# Returns the HTML for a bar of 100 two-pixel images: $percent red ones
# followed by white ones.  $percent is clamped to 0..100.
sub _bar {
    my ($percent) = @_;
    $percent = 0   if $percent < 0;
    $percent = 100 if $percent > 100;
    my $red = img({
        src    => 'red.gif', width  => '2', height  => '20',
        align  => 'LEFT',    border => '0', padding => '0',
        hspace => '0',       vspace => '0',
    });
    my $white = img({
        src    => 'white.gif', width  => '2', height  => '20',
        align  => 'LEFT',      border => '0', padding => '0',
        hspace => '0',         vspace => '0',
    });
    return ($red x $percent) . ($white x (100 - $percent));
}