# Build the test corpus: fetch the story, convert it to plain text, and
# concatenate 1000 copies of it into in.txt.
wget http://www.astro.sunysb.edu/fwalter/AST389/TEXTS/Nightfall.htm
html2text-cpp Nightfall.htm >nightfall.txt
for i in {1..1000}; do cat nightfall.txt >>in.txt; done
####
#!/bin/bash
# Normalise every *.txt file in the current directory: delete punctuation
# and digits, then apply a fixed, ordered list of case-insensitive word
# substitutions. The result is written to out-sed.dat ('>|' overwrites the
# file even when noclobber is set).
cat *.txt | \
  tr -d '[:punct:]' | \
  sed 's/[0-9]//g' | \
  sed 's/w\(as\|ere\)/be/gi' | \
  sed 's/ need.* / need /gi' | \
  sed 's/ .*meant.* / mean /gi' | \
  sed 's/ .*work.* / work /gi' | \
  sed 's/ .*read.* / read /gi' | \
  sed 's/ .*allow.* / allow /gi' | \
  sed 's/ .*gave.* / give /gi' | \
  sed 's/ .*bought.* / buy /gi' | \
  sed 's/ .*want.* / want /gi' | \
  sed 's/ .*hear.* / hear /gi' | \
  sed 's/ .*came.* / come /gi' | \
  sed 's/ .*destr.* / destroy /gi' | \
  sed 's/ .*paid.* / pay /gi' | \
  sed 's/ .*selve.* / self /gi' | \
  sed 's/ .*self.* / self /gi' | \
  sed 's/ .*cities.* / city /gi' | \
  sed 's/ .*fight.* / fight /gi' | \
  sed 's/ .*creat.* / create /gi' | \
  sed 's/ .*makin.* / make /gi' | \
  sed 's/ .*includ.* / include /gi' | \
  sed 's/ .*mean.* / mean /gi' | \
  sed 's/ talk.* / talk /gi' | \
  sed 's/ going / go /gi' | \
  sed 's/ getting / get /gi' | \
  sed 's/ start.* / start /gi' | \
  sed 's/ goes / go /gi' | \
  sed 's/ knew / know /gi' | \
  sed 's/ trying / try /gi' | \
  sed 's/ tried / try /gi' | \
  sed 's/ told / tell /gi' | \
  sed 's/ coming / come /gi' | \
  sed 's/ saying / say /gi' | \
  sed 's/ men / man /gi' | \
  sed 's/ women / woman /gi' | \
  sed 's/ took / take /gi' | \
  sed 's/ tak.* / take /gi' | \
  sed 's/ lying / lie /gi' | \
  sed 's/ dying / die /gi' | \
  sed 's/ made / make /gi' | \
  sed 's/ used.* / use /gi' | \
  sed 's/ using.* / use /gi' \
  >|out-sed.dat
####
% time ./re.sh
real    0m5,201s
user    0m43,394s
sys     0m1,302s
####
#!/usr/bin/perl
# Perl port of the sed pipeline: normalise every *.txt file in the current
# directory line by line and write the result to out-perl.dat.
#
# Known differences from the shell version, kept as they were:
#   * punctuation is replaced by a space here, while the shell pipeline
#     deletes it (tr -d);
#   * the patterns here have no trailing space after ".*", so those rules
#     always run to the end of the line.
use strict;
use warnings;
use v5.36;

my $out = 'out-perl.dat';

# Opening with '>' creates or truncates the file in one checked step.
open my $OUT, '>', $out
    or die "Cannot open '$out' for writing: $!";

# Ordered substitution rules: [ pattern, replacement ]. Order matters
# because later rules see the output of earlier ones. Every rule is applied
# globally and case-insensitively.
#
# Rules of the form \s.*KEYWORD.* replace everything from the earliest
# whitespace character that has KEYWORD somewhere after it through the end
# of the line.
my @RULES = (
    [ qr/w(as|ere)/i,      'be'        ],
    [ qr/\sneed.*/i,       ' need '    ],
    [ qr/\s.*meant.*/i,    ' mean '    ],
    [ qr/\s.*work.*/i,     ' work '    ],
    [ qr/\s.*read.*/i,     ' read '    ],
    [ qr/\s.*allow.*/i,    ' allow '   ],
    [ qr/\s.*gave.*/i,     ' give '    ],
    [ qr/\s.*bought.*/i,   ' buy '     ],
    [ qr/\s.*want.*/i,     ' want '    ],
    [ qr/\s.*hear.*/i,     ' hear '    ],
    [ qr/\s.*came.*/i,     ' come '    ],
    [ qr/\s.*destr.*/i,    ' destroy ' ],
    [ qr/\s.*paid.*/i,     ' pay '     ],
    [ qr/\s.*selve.*/i,    ' self '    ],
    [ qr/\s.*self.*/i,     ' self '    ],
    [ qr/\s.*cities.*/i,   ' city '    ],
    [ qr/\s.*fight.*/i,    ' fight '   ],
    [ qr/\s.*creat.*/i,    ' create '  ],
    [ qr/\s.*makin.*/i,    ' make '    ],
    [ qr/\s.*includ.*/i,   ' include ' ],
    [ qr/\s.*mean.*/i,     ' mean '    ],
    [ qr/\stalk.*/i,       ' talk '    ],
    [ qr/\sgoing /i,       ' go '      ],
    [ qr/\sgetting /i,     ' get '     ],
    [ qr/\sstart.*/i,      ' start '   ],
    [ qr/\sgoes /i,        ' go '      ],
    [ qr/\sknew /i,        ' know '    ],
    [ qr/\strying /i,      ' try '     ],
    [ qr/\stried /i,       ' try '     ],
    [ qr/\stold /i,        ' tell '    ],
    [ qr/\scoming /i,      ' come '    ],
    [ qr/\ssaying /i,      ' say '     ],
    [ qr/\smen /i,         ' man '     ],
    [ qr/\swomen /i,       ' woman '   ],
    [ qr/\stook /i,        ' take '    ],
    [ qr/\stak.*/i,        ' take '    ],
    [ qr/\slying /i,       ' lie '     ],
    [ qr/\sdying /i,       ' die '     ],
    [ qr/\smade /i,        ' make '    ],
    [ qr/\sused.*/i,       ' use '     ],
    [ qr/\susing.*/i,      ' use '     ],
);

FILE:
for my $file (glob '*.txt') {
    my $IN;
    unless (open $IN, '<', $file) {
        # Best effort: report the unreadable file and carry on.
        warn "Skipping '$file': $!\n";
        next FILE;
    }

    # Read line by line so files of any size are processed completely.
    # Unlike the earlier split-based version, trailing blank lines of a
    # file are preserved.
    while (my $line = <$IN>) {
        chomp $line;

        $line =~ s/[[:punct:]]/ /g;

        # tr/// takes a plain character list; brackets would be literal.
        $line =~ tr/0-9//d;

        for my $rule (@RULES) {
            my ($pattern, $replacement) = @{$rule};
            $line =~ s/$pattern/$replacement/g;
        }

        print {$OUT} "$line\n";
    }

    close $IN;
}

# Buffered write errors only surface at close, so check it.
close $OUT
    or die "Cannot close '$out': $!";
####
% time ./re1.pl
real    4m1,655s
user    4m29,242s
sys     0m0,380s
####
% time ./re2.pl
real    1m5,096s
user    1m11,889s
sys     0m0,524s