# Build the benchmark input: download the sample story, convert the HTML to
# plain text, then append 1000 copies of that text to in.txt.
wget http://www.astro.sunysb.edu/fwalter/AST389/TEXTS/Nightfall.htm
html2text-cpp Nightfall.htm > nightfall.txt
# A single append redirection on the loop replaces one per iteration;
# in.txt is still appended to, never truncated.
for ((copy = 0; copy < 1000; copy++)); do
    cat nightfall.txt
done >> in.txt
####
#!/bin/bash
# Crude word normalizer: concatenates every *.txt file in the current
# directory, strips punctuation and digits, rewrites a fixed list of word
# forms to a base form, and writes the result to out-sed.dat.
#
# Each rule runs in its own sed process on purpose: the stages of a pipeline
# execute concurrently, so the rules are applied in parallel across cores.
#
# Requires GNU sed: the patterns rely on the `\|` alternation and the
# case-insensitive `i` flag of the `s` command.
#
# NOTE: `.*` is greedy and the patterns are not anchored to word boundaries,
# so a rule such as ' .*work.* ' replaces everything from a space before the
# keyword up to the last space on the line, not just the single word.

# Report a failure of any stage (e.g. `cat` finding no *.txt files) through
# the script's exit status instead of only the last sed's status.
set -o pipefail

cat *.txt |
    tr -d '[:punct:]' |
    sed 's/[0-9]//g' |
    sed 's/w\(as\|ere\)/be/gi' |
    sed 's/ need.* / need /gi' |
    sed 's/ .*meant.* / mean /gi' |
    sed 's/ .*work.* / work /gi' |
    sed 's/ .*read.* / read /gi' |
    sed 's/ .*allow.* / allow /gi' |
    sed 's/ .*gave.* / give /gi' |
    sed 's/ .*bought.* / buy /gi' |
    sed 's/ .*want.* / want /gi' |
    sed 's/ .*hear.* / hear /gi' |
    sed 's/ .*came.* / come /gi' |
    sed 's/ .*destr.* / destroy /gi' |
    sed 's/ .*paid.* / pay /gi' |
    sed 's/ .*selve.* / self /gi' |
    sed 's/ .*self.* / self /gi' |
    sed 's/ .*cities.* / city /gi' |
    sed 's/ .*fight.* / fight /gi' |
    sed 's/ .*creat.* / create /gi' |
    sed 's/ .*makin.* / make /gi' |
    sed 's/ .*includ.* / include /gi' |
    sed 's/ .*mean.* / mean /gi' |
    sed 's/ talk.* / talk /gi' |
    sed 's/ going / go /gi' |
    sed 's/ getting / get /gi' |
    sed 's/ start.* / start /gi' |
    sed 's/ goes / go /gi' |
    sed 's/ knew / know /gi' |
    sed 's/ trying / try /gi' |
    sed 's/ tried / try /gi' |
    sed 's/ told / tell /gi' |
    sed 's/ coming / come /gi' |
    sed 's/ saying / say /gi' |
    sed 's/ men / man /gi' |
    sed 's/ women / woman /gi' |
    sed 's/ took / take /gi' |
    sed 's/ tak.* / take /gi' |
    sed 's/ lying / lie /gi' |
    sed 's/ dying / die /gi' |
    # The replacement keeps its leading space so that "made" is not glued
    # onto the preceding word.
    sed 's/ made / make /gi' |
    sed 's/ used.* / use /gi' |
    # `>|` overwrites out-sed.dat even when the noclobber option is set.
    sed 's/ using.* / use /gi' >|out-sed.dat
##
##
% time ./re.sh
real 0m5,201s
user 0m43,394s
sys 0m1,302s
##
##
#!/usr/bin/perl
use strict;
use warnings;
use 5.36.0;

# Crude word normalizer: reads every *.txt file in the current directory,
# rewrites each line with normalize_line(), and writes the result to
# out-perl.dat (overwriting any previous content).
#
# Input files are streamed line by line, so file size is not limited by a
# read buffer and no line is cut in the middle.

my $OUT_FILE = 'out-perl.dat';

# normalize_line($line) -> $normalized
#
# Takes one line of text without its trailing newline and returns a
# normalized copy: punctuation is replaced by spaces, ASCII digits are
# deleted, and a fixed list of word forms is rewritten to a base form.
#
# NOTE: `.*` is greedy and most patterns have no trailing delimiter, so a
# rule such as /\s.*work.*/ replaces everything from a whitespace character
# before the keyword through the end of the line, not just the single word.
sub normalize_line {
    my ($line) = @_;

    # Alias $_ to the copy so the rule list stays a plain sequence of s///.
    for ($line) {
        s/[[:punct:]]/ /g;
        # tr/// takes a character list, not a regex class: brackets here
        # would be literal characters, so only the range is given.
        tr/0-9//d;
        s/w(?:as|ere)/be/gi;
        s/\sneed.*/ need /gi;
        s/\s.*meant.*/ mean /gi;
        s/\s.*work.*/ work /gi;
        s/\s.*read.*/ read /gi;
        s/\s.*allow.*/ allow /gi;
        s/\s.*gave.*/ give /gi;
        s/\s.*bought.*/ buy /gi;
        s/\s.*want.*/ want /gi;
        s/\s.*hear.*/ hear /gi;
        s/\s.*came.*/ come /gi;
        s/\s.*destr.*/ destroy /gi;
        s/\s.*paid.*/ pay /gi;
        s/\s.*selve.*/ self /gi;
        s/\s.*self.*/ self /gi;
        s/\s.*cities.*/ city /gi;
        s/\s.*fight.*/ fight /gi;
        s/\s.*creat.*/ create /gi;
        s/\s.*makin.*/ make /gi;
        s/\s.*includ.*/ include /gi;
        s/\s.*mean.*/ mean /gi;
        s/\stalk.*/ talk /gi;
        s/\sgoing / go /gi;
        s/\sgetting / get /gi;
        s/\sstart.*/ start /gi;
        s/\sgoes / go /gi;
        s/\sknew / know /gi;
        s/\strying / try /gi;
        s/\stried / try /gi;
        s/\stold / tell /gi;
        s/\scoming / come /gi;
        s/\ssaying / say /gi;
        s/\smen / man /gi;
        s/\swomen / woman /gi;
        s/\stook / take /gi;
        s/\stak.*/ take /gi;
        s/\slying / lie /gi;
        s/\sdying / die /gi;
        # The replacement keeps its leading space so that "made" is not
        # glued onto the preceding word.
        s/\smade / make /gi;
        s/\sused.*/ use /gi;
        s/\susing.*/ use /gi;
    }

    return $line;
}

# main()
#
# Normalizes all *.txt files into $OUT_FILE. An input file that cannot be
# opened is reported on STDERR and skipped; failure to open, write, or close
# the output file is fatal.
sub main {
    open my $out_fh, '>', $OUT_FILE
        or die "Cannot open $OUT_FILE for writing: $!\n";

    FILE:
    for my $path (glob '*.txt') {
        my $in_fh;
        if (!open $in_fh, '<', $path) {
            warn "Skipping $path: $!\n";
            next FILE;
        }

        while (my $line = <$in_fh>) {
            chomp $line;
            print {$out_fh} normalize_line($line), "\n"
                or die "Cannot write to $OUT_FILE: $!\n";
        }

        # Close each input exactly once, after its last line.
        close $in_fh or warn "Error closing $path: $!\n";
    }

    # Buffered write errors surface at close time.
    close $out_fh or die "Cannot close $OUT_FILE: $!\n";
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without touching any files.
main() unless caller;

1;
##
##
% time ./re1.pl
real 4m1,655s
user 4m29,242s
sys 0m0,380s
##
##
% time ./re2.pl
real 1m5,096s
user 1m11,889s
sys 0m0,524s