Beefy Boxes and Bandwidth Generously Provided by pair Networks
We don't bite newbies here... much
 
PerlMonks  

Getstore to avoid out of memory?

by wrkrbeee (Scribe)
on Mar 01, 2017 at 21:32 UTC ( [id://1183324]=perlquestion: print w/replies, xml ) Need Help??

wrkrbeee has asked for the wisdom of the Perl Monks concerning the following question:

Hello Perl Monks, I receive the dreaded "out of memory" message when executing the program below. Thought I was reading but one line of the web page, but probably mistaken. Any thoughts are greatly appreciated.

# Fragment: download each filing not already on disk, scrape the header
# fields from it, and append one pipe-delimited record per filing to the
# results file $write_dir.  The enclosing year/quarter loops are opened
# earlier in the file; their closing braces are kept at the end here.
foreach $filetoget (@aonly) {
    $fullfile = "$base_url/$filetoget";
    $fonly    = $filetoget;

    # NOTE(review): get() slurps the ENTIRE filing into memory before the
    # split -- multi-megabyte filings are the likely cause of the reported
    # "out of memory".  Consider a size-capped fetch (:content_cb).
    #
    # BUG FIXED: the original pattern qr/\'\n'/ split on the literal
    # three-character sequence '<newline>' which essentially never occurs,
    # so the whole document was processed as a single "line".  Split on a
    # plain newline instead.
    for my $line (split /\n/, get($fullfile)) {
        # Scan until two lines have been counted, collecting header fields.
        while ($line_count < 2) {
            if ($line =~ m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)            { $cik         = $1; }
            if ($line =~ m/^\s*FORM\s*TYPE:\s*(.*$)/m)                      { $form_type   = $1; }
            if ($line =~ m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) { $report_date = $1; }
            if ($line =~ m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)           { $file_date   = $1; }
            if ($line =~ m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m)       { $name        = $1; }
            $line_count++;
            print "$cik, $form_type, $report_date, $file_date, $name\n";
            print "$line_count\n";
        }    # end of while loop
    }    # end of "for my $line" split loop

    ### Now append one record for this filing to the results file.
    open my $FH_OUT, '>>', $write_dir or die "Can't open file $write_dir: $!";
    # NOTE(review): $, is a global output-field separator; setting it here
    # affects every later print in the program.  Kept for compatibility.
    $, = '|';
    print $FH_OUT "$cik$,$form_type$,$report_date$,$file_date$,$name$,\n";
    $line_count = 0;

    # Update file counter.
    ++$file_count;
    print "$file_count lines read from $fullfile\n";
    close($FH_OUT) or die "Can't close file $write_dir: $!";
}    # end of foreach file to get loop
}    # end of qtr loop
}    # end of year loop

# Strip leading and trailing whitespace from a string; returns the
# trimmed copy (the caller's argument is not modified).
sub trim {
    my ($phrase) = @_;
    $phrase =~ s/^\s+//;
    $phrase =~ s/\s+$//;
    return $phrase;
}

Replies are listed 'Best First'.
Re: Getstore to avoid out of memory?
by Anonymous Monk on Mar 01, 2017 at 21:43 UTC
    You have lines of code that you have commented out and the formatting is misleading at best. Please clean your room: don't present us with a mess.
      My apologies, revised/cleaned code appears below:
      #!/usr/bin/perl -w #use strict; use Tie::File; use Fcntl; use LWP::Simple; #First year you want downloaded files for for: my $startyear=2016; #Last year you want files for: my $endyear=2016; #First qtr you want files for (usually 1): my $startqtr=1; #Last qtr you want files for (usually 4): my $endqtr=1; #The directory you want your index files to be stored in. my $inddirect="C:/Volumes/EDGAR1/Edgar/full-index"; #The directory you are going to download filings to my $direct="G:/Research/SEC filings 10K and 10Q/Data"; #The file that will contain the filings you want to download. my $outfile="G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt"; #Specify the directory containing the results/output; my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\ +data2016.txt'; my $base_url = 'http://www.sec.gov/Archives'; my $line_count=0; my $cik=-99; my $form_type=""; my $report_date=-99; my $file_date=-99; my $name=""; my $count=0; #Initialize file counter variable; my $file_count = 0; my $formget1='(10-K )'; my $formget2='(10-K405 )'; my $formget3='(10KSB )'; my $formget4='(10-KSB )'; my $formget5='(10KSB40 )'; my $formget6='(10-KT )'; my $formget7='(10KT405 )'; my $slash='/'; for($yr=$startyear;$yr<=$endyear;$yr++) { #loop through all the index quarters you specified if($yr<$endyear){$eqtr=4}else{$eqtr=$endqtr} for($qtr=$startqtr;$qtr<=$eqtr;$qtr++) { #Open the index file open(INPUT, "$inddirect/company$qtr$yr.idx") || die "file for company$ +qtr$yr.idx: $!"; #Open the file you want to write to. The first time through #the file is opened to "replace" the existing file. #After that, it is opened to append ">>". 
if ($yr==$startyear && $qtr==$startqtr) {$outfiler=">$outfile";} else{$outfiler=">>$outfile";} open(OUTPUT, "$outfiler") || die "file for 2006 1: $!"; $count=1; while ($line=<INPUT>) { #ignore the first 10 lines because they only contain header informatio +n if ($.<11) {next}; $company_name=substr($line,0,60); $form_type=substr($line,62,12); my $cik=substr($line,74,10); $file_date=substr($line,86,10); $file_date=~s/\-//g; my $fullfilename=trim(substr($line,98,43)); if ($form_type=~/^$formget1(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $line_count, " ", $form_type, " ", $base_url,"/",$fullfilena +me,"\n"; } elsif ($form_type=~/^$formget2(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget3(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget4(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget5(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } elsif ($form_type=~/^$formget6(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } elsif ($form_type=~/^$formget7(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } } close(INPUT); close(OUTPUT); # check to see if directory exists. If not, create it. unless(-d "$direct$slash$yr"){ mkdir("$direct$slash$yr") or die; } #Open the directory and get put the names of all files into the array +@old opendir(DIR,"$direct$slash$yr")||die "Can't open directory"; @Old=readdir(DIR); #The tie statement assigns the file containing the #files you want to download to the array @New1. 
tie(@New1,Tie::File,"$outfile", mode=> O_RDWR) or die "Cannot tie file BOO: $!n"; #checks to see what files on the current index listing are not in the +directory #defines a hash called seen. %seen=(); #defines an array called @aonly. @aonly=(); #build lookup table. This step is building a lookup table(hash). #each filename (from OLD) has a value of 1 assigned to it. foreach $item(@Old){$seen{$item}=1} #for each item in the New1 array, which we got from the txt file #containing all the files we want to download, add #it to the array, @aonly, as long is it is not already #in the current directory. We do this so we don't download #a file we have already downloaded. foreach $item(@New1){ $item=~/(edgar\/data\/.*\/)(.*\.txt)/; unless($seen{$item}){ push(@aonly,$item); } } #downloads all the files in the @oanly array which are the files not i +n the directory foreach $filetoget(@aonly) { $fullfile="$base_url/$filetoget"; $fonly=$filetoget; for my $line (split qr/\'\n'/, get($fullfile)) { while ($line_count < 2) { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m){$cik=$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m){$form_type=$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m){$ +report_date=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m){$file_date= +$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m){$name=$ +1;} $line_count++; print "$cik, $form_type, $report_date, $file_date, $name\n"; print "$line_count\n"; } # end of while loop; } # end of for my $line split loop; ### Now write the results to file!; #Open the ouput file; open my $FH_OUT, '>>',$write_dir or die "Can't open file $write_dir"; #Save/write results/output; $,='|'; print $FH_OUT "$cik$,$form_type$,$report_date$,$file_date$,$name$,\n"; $line_count=0; #Update file counter; ++$file_count; print "$file_count lines read from $fullfile\n"; #closedir($dir_handle); close($FH_OUT); } # end of foreach file to get loop; #end of qtr loop } #end of year loop } sub trim { my $new_phrase; my 
$phrase = shift(@_); $phrase =~ s/^\s+//; $phrase =~ s/\s+$//; $new_phrase = "$phrase"; return "$new_phrase"; }

        Never comment out use strict. Just don't.

        In fact, make it help you by defining variables within the scope they are needed, and only there. The memory they take up will be released once the variables go out of scope. That could possibly fix your problem.

        Do your print statements give any clue about where your program runs out of memory? If not, I would pare the program down into steps until you can find out where.

        But God demonstrates His own love toward us, in that while we were yet sinners, Christ died for us. Romans 5:8 (NASB)

        #!/usr/bin/perl -w use strict; use warnings; my $base_url = 'http://www.sec.gov/Archives'; my @aonly=qw{ edgar/data/1122304/0001193125-15-118890.txt' edgar/data/1640984/0001052918-16-000754.txt edgar/data/1640984/0001052918-16-000803.txt edgar/data/1084869/0001437749-16-024828.txt edgar/data/1084869/0001084869-16-000045.txt edgar/data/1084869/0001084869-16-000046.txt edgar/data/1084869/0001084869-16-000047.txt edgar/data/1084869/0000950162-16-000085.txt edgar/data/1511144/0001206774-16-004470.txt edgar/data/1665022/0001665022-16-000001.txt edgar/data/1662965/0001662965-16-000001.txt edgar/data/1651654/0001651654-16-000002.txt edgar/data/1664237/0001664237-16-000001.txt edgar/data/1664513/0001664513-16-000001.txt edgar/data/1665711/0001665711-16-000001.txt edgar/data/1665354/0001665354-16-000001.txt edgar/data/1664635/0001664635-16-000001.txt edgar/data/1625109/0001625109-16-000002.txt edgar/data/1658659/0001658659-16-000002.txt edgar/data/1666635/0001666635-16-000001.txt edgar/data/1614102/0001614102-16-000002.txt edgar/data/1665218/0001665218-16-000001.txt edgar/data/1663921/0000905729-16-000383.txt edgar/data/1666561/0001666561-16-000002.txt edgar/data/1668972/0001668972-16-000002.txt edgar/data/1540531/0000905718-16-001186.txt edgar/data/1540531/0000905718-16-001254.txt }; my $file_count=0; my $FH_OUT=\*STDOUT; my @fields=qw/cik form_type report_date file_date name/; foreach my $filetoget(@aonly) { my $res=get_process_trunc ($filetoget); if (scalar(keys(%$res))) { my $lineout=''; for my $field (@fields){ if ($res->{$field}) {$lineout.=$res->{$field}} $lineout.='|'; } print $FH_OUT $lineout."\n"; } } exit; sub get_process_trunc { # http://www.perlmonks.org/?node_id=1183107 my $filetoget=shift; my $fullfile="$base_url/$filetoget"; my $res={}; use LWP::UserAgent; my $received_size = 0; my $partial = ''; my $ua = LWP::UserAgent->new; my $response = $ua->get($fullfile , ':content_cb'=> sub { my ($data, $response, $protocol) = @_; $partial.=$data; 
$received_size += length $data; + die if ($received_size>10000); # die inside this callback interrupt th +e request, not the program!! } ); if ($partial) { # print 'length:'.length($partial)."\n"; my $line_count=0; for my $line (split qr/\'\n'/, $partial) { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m) +{$res->{cik} =$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m) +{$res->{form_type} =$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) +{$res->{report_date}=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m) +{$res->{file_date} =$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m) +{$res->{name} =$1;} $line_count++; last if ($line_count>50); } } # success return $res; } # get_process_trunc
        result
        0001394872|4|20160119|20160121|Ambient Water Corp| 0001394872|5|20150422|20160209|Ambient Water Corp| 0001084869|10-Q|20151227|20160205|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160329|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|SC 13G/A||20160216|1 800 FLOWERS COM INC| 0001511144|13F-HR|20151231|20160212|10-15 ASSOCIATES, INC.| 0001665022|D||20160127|10-20 Channel Center REIT| 0001662965|D||20160106|100 Wall Investments LLC| 0001651654|D/A||20160127|100INSIGHTS, INC| 0001664237|D||20160212|1075 Weybridge Holding, LLC| 0001664513|D||20160205|10Stories, Inc.| 0001665711|D||20160210|11 Madison Investor II LLC| 0001665354|D||20160210|11 Madison Investor LLC| 0001664635|D||20160223|11 Pine, Inc.| 0001625109|D||20160226|11 Roniin, LLC| 0001658659|D/A||20160113|110 Corcoran Property Partners, LLC| 0001666635|D||20160210|1111 Broadway Distribution, LLC| 0001614102|D||20160301|1125 North Fairfax LLC| 0001665218|D||20160128|114 REIT LP| 0001663921|D||20160126|1143 Highland Drive, LLC| 0001666561|D||20160211|1155 Boulder, LLC| 0001668972|D||20160323|11619 Euclid, LLC| 0001540531|13F-HR|20151231|20160216|12 West Capital Management LP| 0001386301|SC 13G||20160307|Research Solutions, Inc.|
        I thought I recognized this type of data
        notice reference to @Discipulus at Re: Split web page, first 30 lines only -- :content_cb trick

        You know how you have sub trim? You need about 5-10 more of those, all those big blocks of code need to be subs

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: perlquestion [id://1183324]
Approved by Corion
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others chilling in the Monastery: (1)
As of 2024-04-18 23:32 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found