http://qs321.pair.com?node_id=534384

Articuno has asked for the wisdom of the Perl Monks concerning the following question:

Hi Monks!

As I was asking in CB, I have a Perl script that reads a 25 MB file to $_ (undef'ing $/) , and does a lot of regex matches against it (like a state machine).

Now I know (thanks diotalevi) that regex matches are cloning my whole string... so I'd like to ask you all some suggestions for lowering the memory consumption :-)

Below is my script:

(disclaimer - this is my full code, if need I'll update the node cutting unneeded things)

(ps - the code reads all *.ms files in the directory, but currently i'm testing with only one file)

(ps2 - the examples and text were in portuguese, but i've translated them here)

(ps3 - the stoplist is just a (very tiny) list of so called "stopwords" - words like "de","do","da" (of), etc...)

# Unify name tags # <Larry> da <Silva> <Wall> X <John> S. <Doe> # becomes # <Larry da Silva Wall> X <John S. Doe> use strict; use warnings; use diagnostics; use Storable; my $debug = 0; $" = ''; if (@ARGV < 1) { print <<INFO Usage: perl $0 [dir] Where [dir] is the directory where the .ms files are located INFO ;exit 1; } my ($dir) = @ARGV; $dir ||= './'; open SW, "/matchsimile/stoplist"; my @stopwords = <SW>; close SW; @stopwords = map { split /\s/ } @stopwords; my %stopword; @stopword{@stopwords} = undef; opendir DIR, $dir; my @files = map { $dir.'/'.$_ } grep { /\.ms$/ } sort readdir(DIR); closedir DIR; undef $/; my $caps = qr/[A-ZÄÅÆÇÈÒÉÜÓÊÝÔËðÕÌßÖÍÎØÏÙÐþÚÑÛÀÁÂÃ]/; my $texto = qr/[A-Za-zÄÅÆÇÈÒÉÜÓÊæÝÔËðçÞÕÌúñèßÖÍûòéàÎüóêáØÏýôëâÙÐþõìãÚÑ +ÿöíäÛÀîåÁøïÂùÃ\s]+/; my @buffer; for my $file (@files) { @buffer = (); open IN, "< $file" or die "'$file' couldn't be opened"; $_ = <IN>; close IN; open OUT, "> $file.new"; my $aux = select(OUT); $|=1; select($aux); print STDERR "Processando $file\n"; my $state = 'TEXT'; my $total_size = length($_); s/(<[^>]*)<([^<>]*>)/$1$2/g; # <<foo> <bar> baz <quux>> vira <foo +bar baz quux> study $_; /^/gc; my $tick = time; until(pos($_) == $total_size) { print STDERR sprintf "\r%10d bytes",$total_size - pos($_) if time +> $tick && ($tick = time); if ($debug) { print STDERR "[@buffer]"; print STDERR snippet(); print STDERR "\n"; } if ($state eq 'TEXT') { if (/\G(\s*<$caps[^<>]*>\s*)/gc) { $state = 'NAME'; push @buff +er, $1; next; } if (/\G<([^<>]*)>/gc || /\G([^<>\s]*\s*)/gc) { print OUT $1; +next; } die "STRANGE FOO ".snippet(); } if ($state eq 'NAME') { if (/\G(<$caps[^<>]*>\s*)/gc) { push @buffer, $1; next } if (/\G<([^<>]*)>/gc) { flush_name(); $state = 'TEXT'; print +OUT $1; next; } if (/\G(\s*([A-Z]\s*\.|(?!\s)$texto(?<!\s))\s*)/gc) { my ($token,$subtoken) = ($1,$2); if ($subtoken =~ /\b(?:x|e|ou)\b/i) { flush_name(); $state = ' +TEXT'; print OUT $token; next; } $subtoken =~ tr/ÄÅÆÇÈÒÉÜÓÊæÝÔËðçÞÕÌúñèßÖÍûòéàÎüóêáØÏýôëâÙÐþõìã +ÚÑÿöíäÛÀîåÁøïÂùÃ/AAACEOEUOEaYOEecTOIunesOIuoeaIuoeaOIyoeaUEtoiaUNyoia +UAiaAoiAuA/; $subtoken =~ tr/A-Z/a-z/; if (length($subtoken) <= 2 || exists $stopword{$subtoken}) { p +ush @buffer, $token; next; } flush_name(); $state = 'TEXT'; print OUT $token; next; } if (/\G([^<>\s]+\s*)/gc) { flush_name(); $state = 'TEXT'; print OUT $1; next; } die "STRANGE FOO ".snippet(); } die "STRANGE FOO ".snippet(); } close OUT; } print STDERR "\n"; exit 0; sub snippet { my $text = "{".substr($_,pos($_),42)."}"; $text =~ s/[\r\n]/|/g; return $text; } sub flush_name { my $count = 0; for my $token (@buffer) { if ( index($token,'<') >= 0 ) { if (++$count == 2) { my $buffer = "@buffer"; $buffer =~ s/>([^<>]*)</$1/g; print OUT $buffer; @buffer = (); return; } } } for my $tk (@buffer) { $tk =~ s/[<>]//g; print OUT $tk; } @buffer = (); return; }
-- 6x9=42