#!/usr/bin/perl
# gutenberg_ftp.pl LIST OF LASTNAMES
#
# Will download all the book titles under each author on the list
# of names into a local archive. It must be run from the directory
# where the archive resides.
#
# Example:
#
# % mkdir archive
# % gutenberg_ftp.pl Conan\ Doyle Conrad Gogol Darwin
#
# After running these commands archive will contain one sub directory
+
# for each author, and each of these will contain all the books for t
+hat
# author on Project Gutenberg.
use LWP::UserAgent;
use URI::URL;
use HTML::TokeParser;
$ua = new LWP::UserAgent;
$ua->agent("efetepeador");
#$ua->proxy([ 'ftp' ], 'http://www-dms.esd.sgi.com:8080/');
# Create a new request.
my $req = new HTTP::Request GET =>
"ftp://sailor.gutenberg.org/pub/gutenberg/by-author.html";
my $res = $ua->request($req, '/tmp/efete');
if(! $res->is_success) {
print STDOUT "Failure to connect to server: " . $res->message;
}
$prsr = HTML::TokeParser->new('/tmp/efete');
while (my $token = $prsr->get_tag("a")) {
my $url = $token->[1]{href} || "-";
my $text = $prsr->get_trimmed_text("/a");
foreach (@ARGV) {
if($text =~ /$_/i) {
print "found $_\n";
push @{$authorURLs{$_}}, ($url);
}
}
# print "$url\t$text\n";
}
AUTHOR:
foreach $author (keys %authorURLs) {
print "Retrieving author $author ";
for (@{$authorURLs{$author}}) {
print "URL $_ contains: \n";
# Create a new request.
my $authReq = new HTTP::Request GET =>
"ftp://sailor.gutenberg.org/pub/gutenberg/$_";
my $authRes = $ua->request($authReq, '/tmp/authefete');
if(! $authRes->is_success) {
print STDOUT "Failure to connect to server: " . $authRes->messag
+e;
}
$authPrsr = HTML::TokeParser->new('/tmp/authefete');
#substitute space for underscore if needed
$author =~ s/ /_/g;
if(! opendir AUTH_DIR, "archive/$author") {
`mkdir archive/$author`;
}
elsif(! opendir AUTH_DIR, "archive/$author") {
print STDOUT "couldn't create directory: archive/$author\n";
next AUTHOR;
}
RETRIEVING:
while (my $authToken = $authPrsr->get_tag("a")) {
my $bookUrl = $authToken->[1]{href} || "-";
my $bookText = $authPrsr->get_trimmed_text("/a");
if($bookUrl =~ /\/([^\/]+\.gz)\b/) {
$book = $1;
#check for file presence before attempting download
rewinddir AUTH_DIR;
while($dirEntry = readdir(AUTH_DIR)) {
if($dirEntry =~ /$book/) {
print "$book already in local repository\n";
next RETRIEVING;
}
}
print "downloading into archive/$author/$book ..\n";
#pause for one minute
sleep(60);
my $bookReq = HTTP::Request->new("GET", $bookUrl);
my $bookRes = $ua->request($bookReq, "archive/$author/$book");
if(! $bookRes->is_success) {
print STDOUT "Failure to connect to server: " . $bookRes->messag
+e .
"\n";
}
}
}
}
}
#---------------------------------------------------------------------
+----
#!/usr/local/bin/perl
#
# indexer.pl
#
# Will generate a DB database file containing a histogram of word
# frequencies of the book archive created by the gutenberg_ftp.pl
# program.
#
# To use it just run it from the directory where the 'archive' direct
+ory
# was created. It will generate two files, one of them called index.d
+b
# containing the histogram and the other called indexedFiles.db conta
+ining
# the names of the files indexed so far (this last one allows us to a
+dd
# books to the archive and index them without analizing again the one
+s we
# already had).
#
# Note that this script is very innefficient and requires a good deal
+ of
# free memory on your system to run. A new version should use MySQL i
+nstead
# of DB files to speed it up.
require 5;
use DB_File; # Access DB databases
use Fcntl; # Needed for above...
use File::Find; # Directory searching
undef $/; # Don't obey line boundaries
$currentKey = 0;
# Single database version:
# Stores file entries in index.db as <NULL><ASCII file number>
# The leading NULL prevents any word entries from colliding.
######################################################################
+######
#unlink("index.db"); # Delete old index.db
tie(%index,'DB_File',"index.db",
O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE) ;
tie(%indexedFiles,'DB_File',"indexedFiles.db",
O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE) ;
foreach $filename (keys %indexedFiles) {
}
$wordCount = 0;
find(\&IndexFile,"archive"); # Index all of the files
#show findings here
print "..indexed $wordCount words\n";
untie(%index); # Close database
untie(%indexedFiles); # Close database
######################################################################
+#####
sub IndexFile {
#only gzipped files
$fName = $_;
if(!/\.gz/) {return; }
if(!exists $indexedFiles{$File::Find::name}) {
if(open TEXTO, "gunzip -c $fName |") {
$indexedFiles{$File::Find::name} = true;
} else {
print "couldn't fork for gunzip: $!\n";
}
print "indexing $File::Find::name\n";
my($text) = <TEXTO>; # Read entire output
$wordCount += &IndexWords($text);
} else {
print "already indexed: $File::Find::name\n";
}
}
######################################################################
+#####
sub IndexWords {
my($words) = @_;
my(%worduniq); # for unique-ifying word list
# Split text into Array of words
my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
@words = grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # Strip leading
+ punct
grep { length > 1 } # Must be > 1 char
+long
grep { /[a-zA-Z\xc0-\xff]/ } # alpha chars only
grep { !/[0-9\x00-\x01f]/ } # No nums or ctrl chars
@words;
#count, remove duplicates
@words = grep{ $numWords{$_}++ == 0 } @words; # Remove dup
+licates
foreach (@words) {
my($a) = $index{$_};
$a += $numWords{$_};
$index{$_} = $a;
}
return scalar(@words); # Return count of words indexed
}
#-------------------------------------------------------------
#!/usr/local/bin/perl
# glossary_maker.pl BOOK_FILE
#
# Will take a book from the archive created by the gutenberg_ftp.pl
# script and will look at the word count for each of its words on the
# histogram of word frequencies created by indexer.pl. Starting with t
+he
# less frequent words it will prompt the user to choose which ones to
# include on the glossary. When the user stops choosing words the prog
+ram
# will query a web dictionary and print the definition of all the chos
+en
# words to STDOUT.
require 5;
use DB_File; # Access DB databases
use Fcntl; # Needed for above...
use File::Find; # Directory searching
use Term::ReadKey;
require HTML::TreeBuilder;
require HTML::FormatText;
use LWP::UserAgent;
$|=1; # Asynchronous I/O.
undef $/; # Don't obey line boundaries
$gzFile = $ARGV[0] || die "usage: indagador.pl gzippped_file\n";
open TEXTO, "gunzip -c $gzFile |" || die "couldn't fork for gunzip: $
+!\n";
my($text) = <TEXTO>;
#%bookWords = &IndexWords($text);
$bookIndexRef = &IndexWords($text);
tie(%index,'DB_File',"index.db", # Open DB file
O_RDONLY, 0, $DB_File::DB_BTREE) or die "no abre: $!";
$occurs = 1;
print "n to not include\ny to include\nb to o back one\nq to start que
+rying\n\n";
ReadMode 4; # Turn off controls keys
REVISANDO: while(true) {
print "-- Occuring $occurs times: \n";
print ($occurs == 1) ? " ---\n":"s ---\n";
SALIR_ANTES:
foreach $palabrita (sort keys %$bookIndexRef) {
if($index{$palabrita} == $occurs) {
print " $palabrita [n]: ";
while (not defined ($key = ReadKey(-1))) {}
print "$key\n";
if ($key =~ /b|B/) {
ReadMode 0;
print("which one did you miss? ");
$palabrita = read STDIN, $cars, 128;
push @elegidas, $palabrita;
ReadMode 4;
}
if ($key =~ /y|Y/) {
push @elegidas, $palabrita;
}
elsif ($key =~ /q|Q/) {
last REVISANDO;
}
if($contador++ == 10) {
$contador = 0;
last SALIR_ANTES;
}
}
}
$occurs++;
}
# Reset tty mode
ReadMode 0;
$ua = new LWP::UserAgent;
$ua->agent("Diccionario");
# $ua->proxy([ 'http' ], 'http://www-dms.esd.sgi.com:8080/');
for (@elegidas) {
#submit query -- one every five or so seconds
print"Elegida: $_\n";
$elegida = $_;
QueryDict($elegida, $ua);
}
untie(%index); # Close database
sub QueryDict {
my ($palabrita, $agent) = @_;
#
# Create a new request.
#
my $req = new HTTP::Request GET => "http://work.ucsd.edu:5141/cgi-bi
+n/http_webster?isindex=$palabrita&method=exact";
#
# Pass Request to the user agent and get a response back.
#
my $res = $agent->request($req);
#
# Print outcome of the response.
#
if(! $res->is_success) {
print "Failure to connect to server: " . $res->message;
} else {
$html = $res->content;
$p = HTML::TreeBuilder->new;
$p->parse($html);
$formatter = HTML::FormatText->new(leftmargin => 0, rightmargin =>
+ 60);
$result = $formatter->format($p);
@parrafos = split /^\s+/m, $result;
for($x=2; !($parrafos[$x] =~ /^From WordNet/) and $x < 10; $x++) {
print $parrafos[$x]."\n";
}
}
}
sub IndexWords {
my($words) = @_;
my(%worduniq); # for unique-ifying word list
# Split text into Array of words
my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
@words = grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # Strip leading
+ punct
grep { length > 1 } # Must be > 1 char
+long
grep { /[a-zA-Z\xc0-\xff]/ } # alpha chars only
grep { !/[0-9\x00-\x01f]/ } # No nums or ctrl chars
@words;
#count, remove duplicates
@words = grep{ $numWords{$_}++ == 0 } @words; # Remove dup
+licates
foreach (@words) {
my($a) = $bookIndex{$_};
$a += $numWords{$_};
$bookIndex{$_} = $a;
}
print "indexed " , scalar(@words) . "\n";
return (\%bookIndex); # Return count of words indexed
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.