comment on

#!/usr/local/bin/perl

use CGI qw/:standard/;
use CGI::Carp('fatalsToBrowser');
$CGI::POST_MAX=1024 * 500; # Prevents really big uploads
use LWP::Simple;
use HTML::TokeParser;

# User-specific data - change this!

# URL of this program
$program_url = 'http://lists.bilkent.edu.tr/~robin/cgibin/awlcheck.cgi
+';

# Directory to store files
$filebase = '/home/robin/public_html/awlcheck/';

# URL of CSS stylesheet
$css = 'awlcheck.css';

# HTML title - diplayed in browser status bar
$title = "Check files against AWL";

# What's displayed at the top of the page
$heading = "Check files against the Academic Word List";

# Name of main list
$listname = "AWL";

# Generic name of sublists
$sublist = "sublist"; 

# The number of sublists
$number_of_sublists = 10;

# Name of file for General Service List
$gsl = "gsl";


# Main program
print header,
start_html(-title=>"$title",
    -style=>{-src=>$css});

print_form()    unless param;
print_results() if param;

print end_html;
close $file;


# The first page
sub print_form {

    print start_multipart_form(),
    h2("$heading"),
    h4("Upload a file from your computer ..."),
    p filefield(-name=>'upload',-size=>70),
            submit(-label=>'Process File'),
    p,
    h4("... or type the URL of a web page here."),
    p textfield(-name=>'url',
        -size=>70,
        -maxlength=>80),
        submit(-label=>'Process web page'),
    p,
    checkbox(-name=>'print_words',
        -label=>' Check if you want found words to be printed'),
    p("This program is designed to work on plain text (.txt) or HTML f
+iles. The program will process other file types, but the results will
+ be wrong, so if you have something like a Word document, save the fi
+le as plain text first."),
    
        p("This program does some serious number-crunching, which can 
+take a while if you have a large file. If you're uploading your PhD t
+hesis, you might want to make dinner while you wait for the results."
+);
    
    end_form;
}

# Processing the file
sub print_results {
    
    
    my $file = param('upload');
    my $print_words = param('print_words');
    $url = param('url');
    if ($url) { 
        
        $webpage = get($url);

        my $stream = HTML::TokeParser->new( \$webpage );
        while (my $token = $stream->get_token) {
              $text_string = $stream->get_text();
            
            @line = split (/\W+/, $text_string);
            push @words, @line;
        }        
         
    }
    
    
    if ($file) {
        my $stream = HTML::TokeParser->new( $file );
        while (my $token = $stream->get_token) {
              $text_string = $stream->get_text();
            
            @line = split (/\W+/, $text_string);
            push @words, @line;
        }
    }

    $wordcount = @words;
    
    
    
        # Compare with General Service List
    open (GSL, "$filebase$gsl") or print "Cannot open GSL";
    undef $INPUT_RECORD_SEPARATOR;
    $common = <GSL>;
    close (GSL);
    
    foreach (@words){
        $search = $_;
        if ($common =~ /$search/i) {$gslcount++;}
    }
    

    for ($sl = 1; $sl <= $number_of_sublists; ++$sl) {
        push @lists, 0;
        # Open the appropriate sublist
        open (SUBLIST, "$filebase$sublist$sl") or die "Cannot open sub
+list";
    
        while (<SUBLIST>) {
            s/-//g; # Get rid of hyphens
            @sublist = split (/\W+/, $_);
        }        
        for ($i = 0; $i <= @words; ++$i) {
            $search = $words[$i];
            
            foreach (@sublist){
                
                if ($search eq $_) {
                    $lists[$sl - 1]++;
                    $totalawl++; 
                    # Collect found words for later printing
                    if ($print_words eq "on"){
                        for ($i2 = 0; $i2 <= @found_words; ++$i2) {
                            if ($found_words[$i2] eq $_) {
                                $already_found = "yes";
                            }
                        }
                        unless ($already_found eq "yes"){
                            push @found_words, $_;
                        }
                    $already_found = "no";
                    }    
                }
            
                
            }
            
        }

    close (SUBLIST);
    }
    
    # Check for empty files and avoid illegal division by zero
    if ($wordcount < 1) {
        $wordcount = 1;
        print "WARNING! This file seems to be empty!";
    }
    
    
    $frequency = int($totalawl / $wordcount * 100);
    $frequency_gsl = int($gslcount / $wordcount * 1000);
    $remainder = 100 - $frequency;
    
    
    if ($frequency < 1) {$perthousand = "Less than 1";} 
    else {$perthousand = $frequency * 10;}
    
    $ratio = $perthousand / $frequency_gsl * 100;
    
    if ($ratio > 30) {
        $comment = '"erudite"';
    } elsif  ($ratio > 10) {
        $comment = '"academic"';
    } elsif  ($ratio > 5) {
        $comment = '"literate"';
    } else {$comment = '"colloquial"';}
    
    # Print the results
    if ($file) {
        print h2("Results for $file");
    } else {
        print h2("Results for $url");
    }
        
    print h5("Comment: $comment"),
    h5("Total words in file = $wordcount"),
    h5("Common words = $frequency_gsl per thousand"),
    h5("Words in $listname = $perthousand per thousand"),
    p;
    
    for ($i2 = 0; $i2 <= $frequency; ++$i2){
            print img {src=>'red.gif', width=>'2', height=>'20', align
+=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        }
        for ($i2 = 0; $i2 <= $remainder; ++$i2){
            print img {src=>'white.gif', width=>'2', height=>'20', ali
+gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        }
    print '<p>&nbsp;</p><h4>Breakdown by Sublist</h4><p>';
    # For some reason using CGI.pm commands for this confuses Internet
+ Explorer
    
    
    for ($sl = 0; $sl < $number_of_sublists; ++$sl) {
        $frequency = int($lists[$sl] / $wordcount * 1000 + 0.5);    
        $percent = int($lists[$sl] / $totalawl * 100 + 0.5);
        $remainder = 100 - $percent;
        print br;
        for ($i2 = 0; $i2 <= $percent; ++$i2){
            print img {src=>'red.gif', width=>'2', height=>'20', align
+=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
        }
        for ($i2 = 0; $i2 <= $remainder; ++$i2){
            print img {src=>'white.gif', width=>'2', height=>'20', ali
+gn=>'LEFT', border=>'0', padding=>'0', hspace=>'0', vspace=>'0'};
            
        }
        
        
        print '&nbsp; Sublist ';
        print $sl+1;
        print ": $percent";
        print '% ';
        print "($lists[$sl] words)";
        print br;
    }
    print "</p>";
    # Print out found words if required
    if ($print_words eq "on"){
        print h3("Words Found");
        @found_words = sort @found_words;
    
#         for ($i = 0; $i <= @found_words; ++$i) {
            print p("@found_words");
#         }
    }    
    
    restore_parameters();
    
    print h4 a({-href=>"$program_url"},"Process another file");
    print h4 a({-href=>"awlcheckexplained.html"},"What these results m
+ean");
    
    
}
[download]

In reply to awlcheck by solri

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


Syntactic Confectionery Delight
	PerlMonks