My apologies, revised/cleaned code appears below:
#!/usr/bin/perl -w
#use strict;
use Tie::File;
use Fcntl;
use LWP::Simple;
# ---------------------------------------------------------------------------
# Configuration: edit the values in this section before running the script.
# ---------------------------------------------------------------------------

# First year you want downloaded files for:
my $startyear = 2016;
# Last year you want files for:
my $endyear = 2016;
# First quarter you want files for (usually 1):
my $startqtr = 1;
# Last quarter you want files for (usually 4):
my $endqtr = 1;

# The directory your quarterly index files (company<qtr><year>.idx) are stored in.
my $inddirect = "C:/Volumes/EDGAR1/Edgar/full-index";
# The directory you are going to download filings to; one sub-directory per
# year is created beneath it.
my $direct = "G:/Research/SEC filings 10K and 10Q/Data";
# The file that will contain the list of filings you want to download.
my $outfile = "G:/Research/SEC filings 10K and 10Q/Data/sizefiles1.txt";
# The file the extracted header data is appended to. Despite the variable
# name this is a file path, not a directory. It must be a single-line
# literal: a line break inside the quotes becomes part of the file name.
my $write_dir = 'G:\Research\SEC filings 10K and 10Q\Data\Header Data\data2016.txt';

# Prefix that is put in front of every filing path taken from the index.
# NOTE(review): sec.gov is believed to require HTTPS and a descriptive
# User-Agent for automated requests -- confirm before a production run.
my $base_url = 'http://www.sec.gov/Archives';

# Working variables shared with the main loop, with their "not found"
# placeholder values.
my $line_count  = 0;
my $cik         = -99;
my $form_type   = "";
my $report_date = -99;
my $file_date   = -99;
my $name        = "";
my $count       = 0;
# Running total of filings processed.
my $file_count = 0;

# Form types to collect. Each is a regex fragment; the trailing space makes
# the match exact against the space-padded form-type column of the index,
# so that e.g. "10-K " does not also match "10-K405".
my $formget1 = '(10-K )';
my $formget2 = '(10-K405 )';
my $formget3 = '(10KSB )';
my $formget4 = '(10-KSB )';
my $formget5 = '(10KSB40 )';
my $formget6 = '(10-KT )';
my $formget7 = '(10KT405 )';

# Path separator used when building directory names.
my $slash = '/';
# ---------------------------------------------------------------------------
# Main driver.
#
# For every requested year/quarter:
#   1. scan the quarterly company index and record, in $outfile, the path of
#      every filing whose form type matches one of $formget1..$formget7;
#   2. make sure the per-year directory under $direct exists and note which
#      file names it already holds;
#   3. fetch each filing of that quarter that is not already in the
#      directory and append one pipe-delimited row
#          CIK|form type|period of report|filing date|company name|
#      to the file named by $write_dir.
#
# Every variable introduced here is lexical, so this section does not stand
# in the way of re-enabling "use strict" at the top of the file.
# ---------------------------------------------------------------------------

# One anchored pattern that accepts any of the wanted form types. The
# negative look-ahead is carried over from the individual patterns and
# rejects a form type that continues with "/" (amendments such as "/A").
my $wanted_form_re = do {
    my $alternatives = join '|',
        $formget1, $formget2, $formget3, $formget4,
        $formget5, $formget6, $formget7;
    qr/^(?:$alternatives)(?!\/)/;
};

for my $yr ($startyear .. $endyear) {

    # Only the first year starts at $startqtr and only the last year stops
    # at $endqtr; every year in between is processed in full.
    my $first_qtr = $yr == $startyear ? $startqtr : 1;
    my $last_qtr  = $yr == $endyear   ? $endqtr   : 4;

    for my $qtr ($first_qtr .. $last_qtr) {

        # The list file is replaced on the very first pass and appended to
        # afterwards, so it ends up holding every filing of the whole run.
        my $list_mode = ($yr == $startyear && $qtr == $startqtr) ? '>' : '>>';
        my @filings = _collect_wanted_filings(
            "$inddirect/company$qtr$yr.idx",
            $outfile, $list_mode, $wanted_form_re, $base_url,
        );

        # Check to see if the per-year directory exists. If not, create it.
        my $year_dir = "$direct/$yr";
        unless (-d $year_dir) {
            mkdir($year_dir) or die "Cannot create directory $year_dir: $!";
        }

        # Build a lookup table of the file names already in the directory.
        my %seen;
        opendir(my $dir_handle, $year_dir)
            or die "Can't open directory $year_dir: $!";
        $seen{$_} = 1 for readdir($dir_handle);
        closedir($dir_handle);

        # Only this quarter's filings are processed here; earlier quarters
        # were handled on their own pass, so no row is written twice.
        FILING:
        for my $filing_path (@filings) {

            # Directory entries are bare file names, so compare against the
            # last component of the index path rather than the whole path.
            my ($basename) = $filing_path =~ m{edgar/data/.*/(.*\.txt)};
            next FILING if defined $basename && $seen{$basename};

            my $url     = "$base_url/$filing_path";
            my $content = get($url);
            unless (defined $content) {
                # get() returns undef on any failure; skip the filing
                # instead of writing a row of placeholder values.
                warn "Could not fetch $url; skipping\n";
                next FILING;
            }

            my @header = _extract_header($content);
            print join(', ', @header), "\n";
            _append_header_row($write_dir, @header);

            ++$file_count;
            print "Filing $file_count processed: $url\n";
        }
    }    # end of quarter loop
}    # end of year loop

# Scan one quarterly company index and return the paths of all filings whose
# form type matches $form_re, in index order.
#   $index_path  index file to read
#   $list_path   file the matching paths are written to, one per line
#   $list_mode   '>' to replace $list_path, '>>' to append to it
#   $form_re     compiled pattern applied to the form-type column
#   $url_prefix  only used for the progress line printed per match
# Dies if either file cannot be opened or the list cannot be flushed.
sub _collect_wanted_filings {
    my ($index_path, $list_path, $list_mode, $form_re, $url_prefix) = @_;

    open(my $index_fh, '<', $index_path)
        or die "Cannot open index file $index_path: $!";
    open(my $list_fh, $list_mode, $list_path)
        or die "Cannot open filing list $list_path: $!";

    my @paths;
    while (my $line = <$index_fh>) {
        # Ignore the first 10 lines because they only contain header
        # information.
        next if $. < 11;
        # Too short to hold the file-name column (blank or damaged line).
        next if length($line) <= 98;

        # Fixed-width columns: form type at offset 62 (12 characters),
        # file name at offset 98 (up to 43 characters).
        my $form = substr($line, 62, 12);
        next unless $form =~ $form_re;

        my $path = trim(substr($line, 98, 43));
        next if $path eq '';

        push @paths, $path;
        print {$list_fh} "$path\n";
        print scalar(@paths), " ", $form, " ", $url_prefix, "/", $path, "\n";
    }

    close($index_fh);
    close($list_fh) or die "Cannot close filing list $list_path: $!";
    return @paths;
}

# Pull the identifying fields out of the text of one filing.
# Returns (cik, form type, period of report, filing date, company name).
# Each field is taken from the first matching line of the document; a field
# that is not found keeps its placeholder (-99 for the numeric fields, the
# empty string for the text fields) so nothing carries over between filings.
sub _extract_header {
    my ($content) = @_;

    my ($cik, $form, $period, $filed, $company) = (-99, "", -99, -99, "");

    if ($content =~ /^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m)             { $cik     = $1 }
    if ($content =~ /^\s*FORM\s*TYPE:\s*(.*)$/m)                       { $form    = $1 }
    if ($content =~ /^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m)  { $period  = $1 }
    if ($content =~ /^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m)            { $filed   = $1 }
    if ($content =~ /^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*)$/m)        { $company = $1 }

    # "." also matches a carriage return, so drop any trailing whitespace
    # left behind by CRLF line endings.
    s/\s+$// for $form, $company;

    return ($cik, $form, $period, $filed, $company);
}

# Append one record to $path: the fields joined by "|" with a trailing "|".
# The file is opened and closed per record so that every completed filing is
# on disk even if a later fetch aborts the run.
sub _append_header_row {
    my ($path, @fields) = @_;

    open(my $out_fh, '>>', $path) or die "Can't open file $path: $!";
    print {$out_fh} join('|', @fields), "|\n";
    close($out_fh) or die "Can't close file $path: $!";
    return;
}
# Return a copy of the given string with all leading and trailing
# whitespace removed; whitespace inside the string is left alone.
sub trim {
    my ($text) = @_;
    for ($text) {
        s/^\s+//;    # strip the left-hand side
        s/\s+$//;    # strip the right-hand side, including a final newline
    }
    return "$text";
}
|