comment on


#!/usr/bin/perl

# gutenberg_ftp.pl LIST OF LASTNAMES
#
#  Will download all the book titles under each author on the list 
#  of names into a local archive. It must be run from the directory
#  where the archive resides.
#
#  Example:
#
#  % mkdir archive
#  % gutenberg_ftp.pl Conan\ Doyle Conrad Gogol Darwin
#
#  After running these commands archive will contain one sub directory
+ 
#  for each author, and each of these will contain all the books for t
+hat
#  author on Project Gutenberg.

use LWP::UserAgent;
use URI::URL;
use HTML::TokeParser;

$ua = new LWP::UserAgent;
$ua->agent("efetepeador");

#$ua->proxy([ 'ftp' ], 'http://www-dms.esd.sgi.com:8080/');  

# Create a new request.
my $req = new HTTP::Request GET => 
    "ftp://sailor.gutenberg.org/pub/gutenberg/by-author.html";
my $res = $ua->request($req, '/tmp/efete');
if(! $res->is_success) {
    print STDOUT "Failure to connect to server: " . $res->message;
}

$prsr = HTML::TokeParser->new('/tmp/efete');

while (my $token = $prsr->get_tag("a")) {
    my $url = $token->[1]{href} || "-";
    my $text = $prsr->get_trimmed_text("/a");

    foreach (@ARGV) {
      if($text =~ /$_/i) {
    print "found $_\n";
    push @{$authorURLs{$_}}, ($url);
      }       
    }
#    print "$url\t$text\n";
} 

AUTHOR:
foreach $author (keys %authorURLs) {
  print "Retrieving author $author ";
  for (@{$authorURLs{$author}}) { 
    print "URL $_ contains: \n";
    # Create a new request.
    my $authReq = new HTTP::Request GET => 
      "ftp://sailor.gutenberg.org/pub/gutenberg/$_";
    my $authRes = $ua->request($authReq, '/tmp/authefete');
    if(! $authRes->is_success) {
      print STDOUT "Failure to connect to server: " . $authRes->messag
+e;
    }
    
    $authPrsr = HTML::TokeParser->new('/tmp/authefete');

    #substitute space for underscore if needed    
    $author =~ s/ /_/g;

    if(! opendir AUTH_DIR, "archive/$author") {
      `mkdir archive/$author`;
    } 
    elsif(! opendir AUTH_DIR, "archive/$author") {
      print STDOUT "couldn't create directory: archive/$author\n";
      next AUTHOR;
    }

  RETRIEVING:    
    while (my $authToken = $authPrsr->get_tag("a")) {
      my $bookUrl = $authToken->[1]{href} || "-";
      my $bookText = $authPrsr->get_trimmed_text("/a");      
      if($bookUrl =~ /\/([^\/]+\.gz)\b/) {
    $book = $1;
    #check for file presence before attempting download
    rewinddir AUTH_DIR;
    while($dirEntry = readdir(AUTH_DIR)) {
      if($dirEntry =~ /$book/) { 
        print "$book already in local repository\n"; 
        next RETRIEVING;
      } 
    }
    print "downloading into archive/$author/$book ..\n";
    #pause for one minute
        sleep(60);
    my $bookReq = HTTP::Request->new("GET", $bookUrl);
    my $bookRes = $ua->request($bookReq, "archive/$author/$book");
    if(! $bookRes->is_success) {
      print STDOUT "Failure to connect to server: " . $bookRes->messag
+e .
        "\n";
    }
      }
    }
  }  
}





#---------------------------------------------------------------------
+----

#!/usr/local/bin/perl
#
# indexer.pl  
#
#  Will generate a DB database file containing a histogram of word 
#  frequencies of the book archive created by the gutenberg_ftp.pl
#  program. 
#
#  To use it just run it from the directory where the 'archive' direct
+ory
#  was created. It will generate two files, one of them called index.d
+b
#  containing the histogram and the other called indexedFiles.db conta
+ining
#  the names of the files indexed so far (this last one allows us to a
+dd 
#  books to the archive and index them without analizing again the one
+s we 
#  already had).
#
#  Note that this script is very innefficient and requires a good deal
+ of
#  free memory on your system to run. A new version should use MySQL i
+nstead
#  of DB files to speed it up.

require 5;
use DB_File;    # Access DB databases
use Fcntl;      # Needed for above...
use File::Find; # Directory searching
undef $/; # Don't obey line boundaries
$currentKey = 0;

# Single database version:
#    Stores file entries in index.db as <NULL><ASCII file number>
#    The leading NULL prevents any word entries from colliding.
######################################################################
+######
#unlink("index.db");         # Delete old index.db
tie(%index,'DB_File',"index.db", 
    O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE) ;

tie(%indexedFiles,'DB_File',"indexedFiles.db", 
    O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE) ;

foreach $filename (keys %indexedFiles) {
}

$wordCount = 0;

find(\&IndexFile,"archive");       # Index all of the files

#show findings here

print "..indexed $wordCount words\n";


untie(%index);                      # Close database
untie(%indexedFiles);                      # Close database
######################################################################
+#####
sub IndexFile {  
  #only gzipped files
  $fName = $_;

  if(!/\.gz/) {return; }
  
  if(!exists $indexedFiles{$File::Find::name}) {    
    if(open TEXTO, "gunzip -c $fName |") {
      $indexedFiles{$File::Find::name} = true;
    } else {
      print "couldn't fork for gunzip: $!\n";
    }

    print "indexing $File::Find::name\n";

    my($text) = <TEXTO>;        # Read entire output
    $wordCount += &IndexWords($text);    
  } else {
    print "already indexed: $File::Find::name\n";
  }
}
######################################################################
+#####
sub IndexWords {
    my($words) = @_;
    my(%worduniq);          # for unique-ifying word list
    # Split text into Array of words
    my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
    @words = grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ }  # Strip leading
+ punct
             grep { length > 1 }                   # Must be > 1 char 
+long
             grep { /[a-zA-Z\xc0-\xff]/ }       # alpha chars only
         grep { !/[0-9\x00-\x01f]/ }           # No nums or ctrl chars
             @words;

    #count, remove duplicates
    @words = grep{ $numWords{$_}++ == 0 } @words;         # Remove dup
+licates

    foreach (@words) {
        my($a) = $index{$_};
        $a += $numWords{$_};
        $index{$_} = $a;
    }

    return scalar(@words);    # Return count of words indexed
}



#-------------------------------------------------------------


#!/usr/local/bin/perl

# glossary_maker.pl BOOK_FILE
#
# Will take a book from the archive created by the gutenberg_ftp.pl 
# script and will look at the word count for each of its words on the 
# histogram of word frequencies created by indexer.pl. Starting with t
+he 
# less frequent words it will prompt the user to choose which ones to
# include on the glossary. When the user stops choosing words the prog
+ram
# will query a web dictionary and print the definition of all the chos
+en
# words to STDOUT.

require 5;
use DB_File;    # Access DB databases
use Fcntl;      # Needed for above...
use File::Find; # Directory searching
use Term::ReadKey;

require HTML::TreeBuilder;
require HTML::FormatText;
use LWP::UserAgent;

$|=1;     # Asynchronous I/O.
undef $/; # Don't obey line boundaries

$gzFile = $ARGV[0] || die "usage: indagador.pl gzippped_file\n";

open TEXTO, "gunzip -c $gzFile |" ||  die "couldn't fork for gunzip: $
+!\n";

my($text) = <TEXTO>;     

#%bookWords = &IndexWords($text);   
$bookIndexRef = &IndexWords($text);   

tie(%index,'DB_File',"index.db",    # Open DB file
    O_RDONLY, 0, $DB_File::DB_BTREE) or die "no abre: $!";

$occurs = 1;

print "n to not include\ny to include\nb to o back one\nq to start que
+rying\n\n";

ReadMode 4; # Turn off controls keys

REVISANDO: while(true) {
  print "-- Occuring $occurs times: \n";
  print ($occurs == 1) ? " ---\n":"s ---\n";

 SALIR_ANTES:
  foreach $palabrita (sort keys %$bookIndexRef) {   
    if($index{$palabrita} == $occurs) {
      print " $palabrita [n]: ";

      while (not defined ($key = ReadKey(-1))) {}

      print "$key\n";
      if ($key =~ /b|B/) {
    ReadMode 0;
    print("which one did you miss? ");
    $palabrita = read STDIN, $cars, 128;
    push @elegidas, $palabrita;
    ReadMode 4;
      }

      if ($key =~ /y|Y/) {
    push @elegidas, $palabrita;
      }
      elsif ($key =~ /q|Q/) {
    last REVISANDO;
      }

      if($contador++ == 10) {
    $contador = 0;
    last SALIR_ANTES;
      }

    }
  }  
  $occurs++;
}

# Reset tty mode
ReadMode 0;


$ua = new LWP::UserAgent;
$ua->agent("Diccionario");
# $ua->proxy([ 'http' ], 'http://www-dms.esd.sgi.com:8080/');

for (@elegidas) {
  #submit query -- one every five or so seconds

  print"Elegida: $_\n";

  $elegida = $_;

  QueryDict($elegida, $ua);

}

untie(%index);                      # Close database



sub QueryDict {
  my ($palabrita, $agent) = @_;
  
  #
  # Create a new request.
  #
  
  my $req = new HTTP::Request GET => "http://work.ucsd.edu:5141/cgi-bi
+n/http_webster?isindex=$palabrita&method=exact";
  
  #
  # Pass Request to the user agent and get a response back.
  #
  my $res = $agent->request($req);
  
  #
  # Print outcome of the response.
  #
  if(! $res->is_success) {
    print "Failure to connect to server: " . $res->message;
  } else {


    $html = $res->content;

    $p = HTML::TreeBuilder->new;
    $p->parse($html);
    $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin =>
+ 60);
    $result = $formatter->format($p);
    
    @parrafos = split /^\s+/m, $result;
    
    for($x=2; !($parrafos[$x] =~ /^From WordNet/) and $x < 10; $x++) {
      print $parrafos[$x]."\n";
    }    
  }
}

sub IndexWords {
    my($words) = @_;
    my(%worduniq);          # for unique-ifying word list
    # Split text into Array of words
    my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
    @words = grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ }  # Strip leading
+ punct
             grep { length > 1 }                   # Must be > 1 char 
+long
             grep { /[a-zA-Z\xc0-\xff]/ }       # alpha chars only
         grep { !/[0-9\x00-\x01f]/ }           # No nums or ctrl chars
             @words;

    #count, remove duplicates
    @words = grep{ $numWords{$_}++ == 0 } @words;         # Remove dup
+licates

    foreach (@words) {
        my($a) = $bookIndex{$_};
        $a += $numWords{$_};
        $bookIndex{$_} = $a;
    }

    print "indexed " , scalar(@words) . "\n";
    
    return (\%bookIndex);    # Return count of words indexed
}
[download]
In reply to Glossary Maker by gregorovius
Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.

Think about Loose Coupling
	PerlMonks