Beefy Boxes and Bandwidth Generously Provided by pair Networks
We don't bite newbies here... much
 
PerlMonks  

Getstore to avoid out of memory?

by wrkrbeee (Scribe)
on Mar 01, 2017 at 21:32 UTC ( [id://1183324]=perlquestion: print w/replies, xml ) Need Help??

wrkrbeee has asked for the wisdom of the Perl Monks concerning the following question:

Hello Perl Monks, I receive the dreaded "out of memory" message when executing the program below. Thought I was reading but one line of the web page, but probably mistaken. Any thoughts are greatly appreciated.

# Fragment: download each filing not already on disk, scrape the header
# fields from it, and append one pipe-delimited record per filing to the
# results file $write_dir.  The enclosing year/quarter loops are opened
# earlier in the file; their closing braces are kept at the end here.
foreach $filetoget (@aonly) {
    $fullfile = "$base_url/$filetoget";
    $fonly    = $filetoget;

    # NOTE(review): get() slurps the ENTIRE filing into memory before the
    # split -- multi-megabyte filings are the likely cause of the reported
    # "out of memory".  Consider a size-capped fetch (:content_cb).
    #
    # BUG FIXED: the original pattern qr/\'\n'/ split on the literal
    # three-character sequence '<newline>' which essentially never occurs,
    # so the whole document was processed as a single "line".  Split on a
    # plain newline instead.
    for my $line (split /\n/, get($fullfile)) {
        # Scan until two lines have been counted, collecting header fields.
        while ($line_count < 2) {
            if ($line =~ m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)            { $cik         = $1; }
            if ($line =~ m/^\s*FORM\s*TYPE:\s*(.*$)/m)                      { $form_type   = $1; }
            if ($line =~ m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) { $report_date = $1; }
            if ($line =~ m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)           { $file_date   = $1; }
            if ($line =~ m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m)       { $name        = $1; }
            $line_count++;
            print "$cik, $form_type, $report_date, $file_date, $name\n";
            print "$line_count\n";
        }    # end of while loop
    }    # end of "for my $line" split loop

    ### Now append one record for this filing to the results file.
    open my $FH_OUT, '>>', $write_dir or die "Can't open file $write_dir: $!";
    # NOTE(review): $, is a global output-field separator; setting it here
    # affects every later print in the program.  Kept for compatibility.
    $, = '|';
    print $FH_OUT "$cik$,$form_type$,$report_date$,$file_date$,$name$,\n";
    $line_count = 0;

    # Update file counter.
    ++$file_count;
    print "$file_count lines read from $fullfile\n";
    close($FH_OUT) or die "Can't close file $write_dir: $!";
}    # end of foreach file to get loop
}    # end of qtr loop
}    # end of year loop

# Strip leading and trailing whitespace from a string; returns the
# trimmed copy (the caller's argument is not modified).
sub trim {
    my ($phrase) = @_;
    $phrase =~ s/^\s+//;
    $phrase =~ s/\s+$//;
    return $phrase;
}

Replies are listed 'Best First'.
Re: Getstore to avoid out of memory?
by Anonymous Monk on Mar 01, 2017 at 21:43 UTC
    You have lines of code that you have commented out and the formatting is misleading at best. Please clean your room: don't present us with a mess.
      My apologies, revised/cleaned code appears below:
      #!/usr/bin/perl -w #use strict; use Tie::File; use Fcntl; use LWP::Simple; #First year you want downloaded files for for: my $startyear=2016; #Last year you want files for: my $endyear=2016; #First qtr you want files for (usually 1): my $startqtr=1; #Last qtr you want files for (usually 4): my $endqtr=1; #The directory you want your index files to be stored in. my $inddirect="C:/Volumes/EDGAR1/Edgar/full-index"; #The directory you are going to download filings to my $direct="G:/Research/SEC filings 10K and 10Q/Data"; #The file that will contain the filings you want to download. my $outfile="G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt"; #Specify the directory containing the results/output; my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\ +data2016.txt'; my $base_url = 'http://www.sec.gov/Archives'; my $line_count=0; my $cik=-99; my $form_type=""; my $report_date=-99; my $file_date=-99; my $name=""; my $count=0; #Initialize file counter variable; my $file_count = 0; my $formget1='(10-K )'; my $formget2='(10-K405 )'; my $formget3='(10KSB )'; my $formget4='(10-KSB )'; my $formget5='(10KSB40 )'; my $formget6='(10-KT )'; my $formget7='(10KT405 )'; my $slash='/'; for($yr=$startyear;$yr<=$endyear;$yr++) { #loop through all the index quarters you specified if($yr<$endyear){$eqtr=4}else{$eqtr=$endqtr} for($qtr=$startqtr;$qtr<=$eqtr;$qtr++) { #Open the index file open(INPUT, "$inddirect/company$qtr$yr.idx") || die "file for company$ +qtr$yr.idx: $!"; #Open the file you want to write to. The first time through #the file is opened to "replace" the existing file. #After that, it is opened to append ">>". 
if ($yr==$startyear && $qtr==$startqtr) {$outfiler=">$outfile";} else{$outfiler=">>$outfile";} open(OUTPUT, "$outfiler") || die "file for 2006 1: $!"; $count=1; while ($line=<INPUT>) { #ignore the first 10 lines because they only contain header informatio +n if ($.<11) {next}; $company_name=substr($line,0,60); $form_type=substr($line,62,12); my $cik=substr($line,74,10); $file_date=substr($line,86,10); $file_date=~s/\-//g; my $fullfilename=trim(substr($line,98,43)); if ($form_type=~/^$formget1(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $line_count, " ", $form_type, " ", $base_url,"/",$fullfilena +me,"\n"; } elsif ($form_type=~/^$formget2(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget3(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget4(?!\/)/) { print OUTPUT "$fullfilename\n" ; $count++; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\n +"; } elsif ($form_type=~/^$formget5(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } elsif ($form_type=~/^$formget6(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } elsif ($form_type=~/^$formget7(?!\/)/) { print OUTPUT "$fullfilename\n" ; print $count, " ", $form_type, " ", $base_url,"/",$fullfilename,"\ +n"; $count++; } } close(INPUT); close(OUTPUT); # check to see if directory exists. If not, create it. unless(-d "$direct$slash$yr"){ mkdir("$direct$slash$yr") or die; } #Open the directory and get put the names of all files into the array +@old opendir(DIR,"$direct$slash$yr")||die "Can't open directory"; @Old=readdir(DIR); #The tie statement assigns the file containing the #files you want to download to the array @New1. 
tie(@New1,Tie::File,"$outfile", mode=> O_RDWR) or die "Cannot tie file BOO: $!n"; #checks to see what files on the current index listing are not in the +directory #defines a hash called seen. %seen=(); #defines an array called @aonly. @aonly=(); #build lookup table. This step is building a lookup table(hash). #each filename (from OLD) has a value of 1 assigned to it. foreach $item(@Old){$seen{$item}=1} #for each item in the New1 array, which we got from the txt file #containing all the files we want to download, add #it to the array, @aonly, as long is it is not already #in the current directory. We do this so we don't download #a file we have already downloaded. foreach $item(@New1){ $item=~/(edgar\/data\/.*\/)(.*\.txt)/; unless($seen{$item}){ push(@aonly,$item); } } #downloads all the files in the @oanly array which are the files not i +n the directory foreach $filetoget(@aonly) { $fullfile="$base_url/$filetoget"; $fonly=$filetoget; for my $line (split qr/\'\n'/, get($fullfile)) { while ($line_count < 2) { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m){$cik=$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m){$form_type=$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m){$ +report_date=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m){$file_date= +$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m){$name=$ +1;} $line_count++; print "$cik, $form_type, $report_date, $file_date, $name\n"; print "$line_count\n"; } # end of while loop; } # end of for my $line split loop; ### Now write the results to file!; #Open the ouput file; open my $FH_OUT, '>>',$write_dir or die "Can't open file $write_dir"; #Save/write results/output; $,='|'; print $FH_OUT "$cik$,$form_type$,$report_date$,$file_date$,$name$,\n"; $line_count=0; #Update file counter; ++$file_count; print "$file_count lines read from $fullfile\n"; #closedir($dir_handle); close($FH_OUT); } # end of foreach file to get loop; #end of qtr loop } #end of year loop } sub trim { my $new_phrase; my 
$phrase = shift(@_); $phrase =~ s/^\s+//; $phrase =~ s/\s+$//; $new_phrase = "$phrase"; return "$new_phrase"; }

        Never comment out use strict. Just don't.

        In fact, make it help you by defining variables within the scope they are needed, and only there. The memory they take up will be released once the variables go out of scope. That could possibly fix your problem.

        Do your print statements give any clue about where your program runs out of memory? If not, I would pare the program down into steps until you can find out where.

        But God demonstrates His own love toward us, in that while we were yet sinners, Christ died for us. Romans 5:8 (NASB)

        #!/usr/bin/perl -w use strict; use warnings; my $base_url = 'http://www.sec.gov/Archives'; my @aonly=qw{ edgar/data/1122304/0001193125-15-118890.txt' edgar/data/1640984/0001052918-16-000754.txt edgar/data/1640984/0001052918-16-000803.txt edgar/data/1084869/0001437749-16-024828.txt edgar/data/1084869/0001084869-16-000045.txt edgar/data/1084869/0001084869-16-000046.txt edgar/data/1084869/0001084869-16-000047.txt edgar/data/1084869/0000950162-16-000085.txt edgar/data/1511144/0001206774-16-004470.txt edgar/data/1665022/0001665022-16-000001.txt edgar/data/1662965/0001662965-16-000001.txt edgar/data/1651654/0001651654-16-000002.txt edgar/data/1664237/0001664237-16-000001.txt edgar/data/1664513/0001664513-16-000001.txt edgar/data/1665711/0001665711-16-000001.txt edgar/data/1665354/0001665354-16-000001.txt edgar/data/1664635/0001664635-16-000001.txt edgar/data/1625109/0001625109-16-000002.txt edgar/data/1658659/0001658659-16-000002.txt edgar/data/1666635/0001666635-16-000001.txt edgar/data/1614102/0001614102-16-000002.txt edgar/data/1665218/0001665218-16-000001.txt edgar/data/1663921/0000905729-16-000383.txt edgar/data/1666561/0001666561-16-000002.txt edgar/data/1668972/0001668972-16-000002.txt edgar/data/1540531/0000905718-16-001186.txt edgar/data/1540531/0000905718-16-001254.txt }; my $file_count=0; my $FH_OUT=\*STDOUT; my @fields=qw/cik form_type report_date file_date name/; foreach my $filetoget(@aonly) { my $res=get_process_trunc ($filetoget); if (scalar(keys(%$res))) { my $lineout=''; for my $field (@fields){ if ($res->{$field}) {$lineout.=$res->{$field}} $lineout.='|'; } print $FH_OUT $lineout."\n"; } } exit; sub get_process_trunc { # http://www.perlmonks.org/?node_id=1183107 my $filetoget=shift; my $fullfile="$base_url/$filetoget"; my $res={}; use LWP::UserAgent; my $received_size = 0; my $partial = ''; my $ua = LWP::UserAgent->new; my $response = $ua->get($fullfile , ':content_cb'=> sub { my ($data, $response, $protocol) = @_; $partial.=$data; 
$received_size += length $data; + die if ($received_size>10000); # die inside this callback interrupt th +e request, not the program!! } ); if ($partial) { # print 'length:'.length($partial)."\n"; my $line_count=0; for my $line (split qr/\'\n'/, $partial) { if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m) +{$res->{cik} =$1;} if($line=~m/^\s*FORM\s*TYPE:\s*(.*$)/m) +{$res->{form_type} =$1;} if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m) +{$res->{report_date}=$1;} if($line=~m/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m) +{$res->{file_date} =$1;} if($line=~m/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m) +{$res->{name} =$1;} $line_count++; last if ($line_count>50); } } # success return $res; } # get_process_trunc
        result
        0001394872|4|20160119|20160121|Ambient Water Corp| 0001394872|5|20150422|20160209|Ambient Water Corp| 0001084869|10-Q|20151227|20160205|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160329|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|SC 13G/A||20160216|1 800 FLOWERS COM INC| 0001511144|13F-HR|20151231|20160212|10-15 ASSOCIATES, INC.| 0001665022|D||20160127|10-20 Channel Center REIT| 0001662965|D||20160106|100 Wall Investments LLC| 0001651654|D/A||20160127|100INSIGHTS, INC| 0001664237|D||20160212|1075 Weybridge Holding, LLC| 0001664513|D||20160205|10Stories, Inc.| 0001665711|D||20160210|11 Madison Investor II LLC| 0001665354|D||20160210|11 Madison Investor LLC| 0001664635|D||20160223|11 Pine, Inc.| 0001625109|D||20160226|11 Roniin, LLC| 0001658659|D/A||20160113|110 Corcoran Property Partners, LLC| 0001666635|D||20160210|1111 Broadway Distribution, LLC| 0001614102|D||20160301|1125 North Fairfax LLC| 0001665218|D||20160128|114 REIT LP| 0001663921|D||20160126|1143 Highland Drive, LLC| 0001666561|D||20160211|1155 Boulder, LLC| 0001668972|D||20160323|11619 Euclid, LLC| 0001540531|13F-HR|20151231|20160216|12 West Capital Management LP| 0001386301|SC 13G||20160307|Research Solutions, Inc.|
        I thought I recognized this type of data
        notice reference to @Discipulus at Re: Split web page, first 30 lines only -- :content_cb trick

        You know how you have sub trim? You need about 5-10 more of those, all those big blocks of code need to be subs

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: perlquestion [id://1183324]
Approved by Corion
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others chilling in the Monastery: (1)
As of 2024-04-18 23:32 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found