#!/usr/bin/perl -w
#
# Walk every regular file in $files_dir and, for each one, extract:
#   * three header values: CENTRAL INDEX KEY, FORM TYPE (10-K / 10K variants
#     only) and CONFORMED PERIOD OF REPORT;
#   * the lines that follow the EX-21 ".htm" marker line, up to an end marker.
#
# Sample of the input, as supplied with the original script:
#
#   *********** Sample text ***************
#   CONFORMED PERIOD OF REPORT: 20081231 ------ individual line I want
#   FILED AS OF DATE: 20090331 ------ individual line I want
#   DATE AS OF CHANGE: 20090331 ------ individual line I want
#   CENTRAL INDEX KEY: 0000786368 ------ individual line I want
#   FORM TYPE: 10-K ------ individual line I want
#   Whole buncha text here …………….
#   EX-21 7 v144610_ex21.htm -----------My starting point
#   ************* BODY OF TEXT I WISH TO EXTRACT ****************
#   ----------- My ending point
#   **********End of sample text ***********

use strict;
use warnings;
use File::stat;
use lib "c:/strawberry/perl/site/lib";

# Directory containing the files to read.
my $files_dir = 'E:\research\audit fee models\filings\Test';

# Destination for the results/output. Declared here but not written to by
# any code in this part of the script.
my $write_dir = 'E:\research\audit fee models\filings\filenames\filenames.txt';

# Upper bound on the number of lines examined in any single input file.
my $max_lines_per_file = 500000;

# Start of the block to accumulate: a line mentioning "ex21" or "ex-21"
# (any case) and ending in ".htm". The capture is the line without its
# leading/trailing whitespace.
my $block_start_re = qr/^\s*(.*?ex-?21.*?\.htm)\s*$/i;

# End of the block to accumulate.
# NOTE(review): the original end-of-block pattern was truncated to
# m/^\s*/ (probably a tag lost to markup stripping). "</DOCUMENT>" is an
# assumption, not something the original script states -- confirm against
# real input before relying on it.
my $block_end_re = qr{^\s*</DOCUMENT>}i;

# Working variables. The header/block ones are reset for every input file.
my $file_count  = 0;
my $line_count  = 0;
my $cik         = -99;
my $form_type   = "";
my $form        = "";
my $report_date = -99;
my $htm         = "";
my $url         = "";
my $slash       = '/';
my @block_lines = ();    # accumulated block text, one element per line

opendir(my $dir_handle, $files_dir) or die "Can't open directory $!";

FILE:
while (my $filename = readdir($dir_handle)) {
    my $path = $files_dir . '/' . $filename;

    # Skip sub-directories, "." / ".." and anything else that is not a
    # plain file. Checked once per file rather than once per line.
    next FILE unless -f $path;

    print "Processing $filename\n";

    open my $FH_IN, '<', $path or die "Can't open $filename: $!";

    # Start each file from a clean slate so that values found in a previous
    # file are never reported for this one, and so that the line cap applies
    # per file instead of accumulating over the whole run.
    $line_count  = 0;
    $cik         = -99;
    $form_type   = "";
    $report_date = -99;
    $htm         = "";
    @block_lines = ();

    my $in_block = 0;    # true while between the start and end markers

    LINE:
    while (my $line = <$FH_IN>) {
        last LINE if $line_count > $max_lines_per_file;

        # Drop the line ending (LF or CRLF) so the end-anchored patterns
        # behave the same regardless of platform.
        $line =~ s/\r?\n\z//;

        # --- Header values -------------------------------------------
        if ($line =~ /^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d+)/) {
            $cik = $1;
            $cik =~ s/^0+(?=\d)//;    # strip leading zeros, keep a lone "0"
        }

        if ($line =~ /^\s*FORM\s*TYPE:\s*(10-?k.*)$/i) {
            $form_type = $1;
        }

        if ($line =~ /^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d+)/) {
            $report_date = $1;
        }

        # --- Block text accumulation ---------------------------------
        if ($in_block) {
            if ($line =~ $block_end_re) {
                $in_block = 0;
            }
            else {
                push @block_lines, $line;
            }
        }
        elsif ($line =~ $block_start_re) {
            # Remember the marker line itself; the block proper starts on
            # the following line.
            $htm      = $1;
            $in_block = 1;
        }

        ++$line_count;
    }

    close $FH_IN;

    # NOTE(review): nothing in the visible script consumes $cik,
    # $form_type, $report_date, $htm or @block_lines. Whatever should be
    # done with them (e.g. writing to $write_dir) belongs here, before the
    # next file overwrites them.
}

closedir $dir_handle;