*********** Sample text ***************
CONFORMED PERIOD OF REPORT: 20081231 ------ individual line I want
FILED AS OF DATE: 20090331 ------ individual line I want
DATE AS OF CHANGE: 20090331 ------ individual line I want
CENTRAL INDEX KEY: 0000786368 ------ individual line I want
FORM TYPE: 10-K ------ individual line I want
A whole bunch of other text here …………….
EX-21
7
v144610_ex21.htm -----------My starting point
************* BODY OF TEXT I WISH TO EXTRACT ****************
----------- My ending point
**********End of sample text ***********
#!/usr/bin/perl -w
use strict;
use warnings;
use File::stat;
use lib "c:/strawberry/perl/site/lib";
#Specify the directory containing the files that you want to read;
my $files_dir = 'E:\research\audit fee models\filings\Test';
#Specify where the results/output are written;
#NOTE(review): despite its name, $write_dir holds the path of a file
#(filenames.txt), not of a directory;
my $write_dir = 'E:\research\audit fee models\filings\filenames\filenames.txt';
#Open the directory containing the files you plan to read;
#The failure message names the directory and separates it from the system error;
opendir(my $dir_handle, $files_dir)
    or die "Can't open directory $files_dir: $!";
#Initialize the variables used by the file loop and the line loop that follow;
#Each name is declared exactly once: a second "my" for the same name in this
#scope masks the first one and raises a warning under "use warnings";
my $file_count  = 0;    #Not referenced in the visible part of the script;
my $line_count  = 0;    #Incremented once per line read; the line loop stops when it exceeds its cap;
my $cik         = -99;  #Overwritten from the "CENTRAL INDEX KEY:" line; -99 is presumably a "not found" placeholder;
my $form_type   = "";   #Overwritten from the "FORM TYPE:" line when it is a 10k/10-k form;
my $form        = "";   #Not referenced in the visible part of the script;
my $report_date = -99;  #Overwritten from the "CONFORMED PERIOD OF REPORT:" line;
my $htm         = "";   #Overwritten from the line that names the ex21/EX-21 .htm document;
my $url         = "";   #Not referenced in the visible part of the script;
my $slash       = '/';  #Not referenced in the visible part of the script;
#Loop for reading each file in the input directory;
while (my $filename = readdir($dir_handle)) {
next unless -f $files_dir.'/'.$filename;
print "Processing $filename\n";
#Open the input file;
open my $FH_IN, '<',$files_dir.'/'.$filename or die "Can't open $filename";
#Within the file loop, read each line of the current file;
#Read the current file one line at a time from $FH_IN; each pass looks for
#header fields and for the line that names the EX-21 document;
while (my $line = <$FH_IN>) {
#Skip the line unless the path is still a plain file;
#NOTE(review): this test does not depend on $line, so it is evaluated again,
#unchanged, for every line of the file;
next unless -f $files_dir.'/'.$filename;
#Stop reading once more than 500000 lines have been counted;
#NOTE(review): $line_count is not reset anywhere in the visible code, so the
#cap appears to apply to the running total across all files - confirm;
if ($line_count > 500000) { last;}
#Begin extracting header type data from the file;
#CIK: capture the digits after "CENTRAL INDEX KEY:" and strip leading zeros
#(e.g. 0000786368 becomes 786368); \d* also accepts an empty run of digits;
if($line=~m/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m){$cik=$1; $cik =~ s/^0+//;}
#Form type: accept "10k..." or "10-k..." in any letter case and keep everything
#from that point to the end of the line;
if($line=~m/^\s*FORM\s*TYPE:\s*(10k.*$)/im || ($line=~m/^\s*FORM\s*TYPE:\s*(10-k.*$)/im))
{$form_type=$1;}
#Report date: capture the digits after "CONFORMED PERIOD OF REPORT:";
if($line=~m/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m){$report_date=$1;}
#End of header type information;
#Begin block text accumulation;
#This REGEX identifies the starting point of the text to accumulate: a line
#that contains "ex21" or "EX-21" and ends in "htm"; the four captures are
#joined back together and stored in $htm;
#NOTE(review): every alternative carries /i, so the 3rd and 4th patterns repeat
#the 1st and 2nd; the "." before "htm" is unescaped and matches any character;
if($line=~m/^\s*(.*?)(ex21)(.*?)(.htm$)/igm ||
$line=~m/^\s*(.*?)(EX-21)(.*?)(.htm$)/igm ||
$line=~m/^\s*(.*?)(ex21)(.*?)(.htm$)/igm ||
$line=~m/^\s*(.*?)(EX-21)(.*?)(.htm$)/igm)
{$htm=join('',$1,$2,$3,$4); }
#Placeholder: the code that accumulates the text (e.g. with push) has not
#been written yet;
#This is the ending point of the text to accumulate;
#FIXME: the statement below is unfinished - the parenthesis of the condition is
#never closed, there is no body, and the pattern does not yet name an end
#marker - so the script does not compile as it stands;
if($line=~m/^\s*/igm;
#End block text accumulation;
#Update line counter;
++$line_count;
}