note
kevyt
<p>
Ken,
Thanks very much for your help.
It's working great, but I forgot about one issue: they might add an "R-1" or "R-2" to the far-left column if there is a revision. I have not used Perl much since 2006, and I rarely used regexes. I also tried to capture some of the comments, but that won't be important going forward.
Example with R-1
https://contractorconnection.gpo.gov/abstract/777292
Example without R-1
https://contractorconnection.gpo.gov/abstract/777293
I also need to install CAM::PDF so I can run it on Linux.
</p>
<code>
#!/usr/bin/perl -w
#
# Download GPO Contractor Connection bid abstracts (PDF), parse every bid row
# and append the parsed values to a set of CSV files:
#
#   gpo_companies.csv      contractor code, company name, bidder name, phone
#   gpo_bids.csv           one row per bid
#   gpo_awards.csv         one row per awarded bid
#   gpo_solicitations.csv  one row per bid (jacket id + contractor code)
#   gpo_abstract_data.csv  every parsed column, meant for import into Excel
#   gpo_log.csv            jackets/rows that could not be processed
#
# The output directory defaults to the original Windows location and can be
# overridden with the GPO_DATA_DIR environment variable (e.g. on Linux).
use strict;
use warnings;
use CAM::PDF;
use LWP::Simple;
use LWP::UserAgent;    # used directly below; do not rely on LWP::Simple loading it
use Data::Dumper;
use File::Spec;
use IO::Handle;

# Positions of the capture groups of $bid_re inside @fields.
use constant {
    AWARD_FLAG      => 0,
    CONTRACTOR_CODE => 1,
    COMPANY_NAME    => 2,
    AMOUNT          => 3,
    ADDL_RATE       => 4,
    ADDL_RATE_PER   => 5,
    DISCOUNT_PCT    => 6,
    DISCOUNT_DAYS   => 7,
    DISCOUNT_PRICE  => 8,
    BIDDER_NAME     => 9,
    DATE_RECEIVED   => 10,
    PHONE_NUMBER    => 11,
};

# Open $path for appending and return an unbuffered lexical handle; dies on
# failure. Unbuffered so that an interrupted (very long) run loses no rows.
sub open_append {
    my ($path) = @_;
    open my $fh, '>>', $path
        or die "Can't open the output file $path: $!";
    $fh->autoflush(1);
    return $fh;
}

my $data_dir = defined $ENV{GPO_DATA_DIR}
    ? $ENV{GPO_DATA_DIR}
    : 'c:\Users\Kevin\Documents\dev\data_files';

#### These will be used to load different database tables #####
#### abstract_data will be imported into excel (temp. solution so I won't
#### have to create the db tables now)
my %path_for = map { $_ => File::Spec->catfile($data_dir, "gpo_$_.csv") }
    qw(companies bids awards solicitations log abstract_data);

# The files are opened in append mode, so only write the column header when
# the combined file is new or empty; otherwise every run would add a header
# line in the middle of the data.
my $need_header = !-s $path_for{abstract_data};

my $company_fh = open_append($path_for{companies});
my $bid_fh     = open_append($path_for{bids});
my $award_fh   = open_append($path_for{awards});
my $solicit_fh = open_append($path_for{solicitations});
my $log_fh     = open_append($path_for{log});
my $out_fh     = open_append($path_for{abstract_data});

print {$out_fh} "Jacket_ID,Award,Contractor_Code,Company_Name,Amount,Addl_Rate,Addl_Rate_Per,Discount_Percent,Discount_Days,Discount_Price,Bidders_Name,Date_Received,Phone_Number\n"
    if $need_header;

my @ns_headers = (
    'User-Agent'      => 'Mozilla/4.76 [en] (Win98; U)',
    'Accept'          => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset'  => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);

# One bid row, i.e. the page text that follows an occurrence of the jacket id.
# Compiled once because it does not depend on the jacket being processed.
# NOTE(review): the revision marker is assumed to appear right after the
# jacket id in the extracted text; confirm against abstract 777292.
my $bid_re = qr{
    \A
    \s*?
    (?:R-\d+\s*)?         # optional revision marker, R-1 or R-2 - not captured
    (A?)                  # Awd - 0
    \s+
    (\d+-\d+)             # Contractor Code - 1
    \s+
    ([^\$]+?)             # Name - 2
    \s+
    (\$\s[0-9,.]+)        # Amount - 3
    \s+
    (\$\s[0-9,.]+)        # Add'l Rate - 4
    \s+
    ([^\$]+?)             # Add'l Rate's Per - 5
    \s+
    ([0-9.]+)             # Discount % - 6
    \s+
    (\d+)                 # Discount Days - 7
    \s+
    (\$\s[0-9,.]+)        # Discount Price - 8
    \s+
    ([\D]+?)              # Bidders Name - 9
    \s+
    (\S+)                 # Date Received - 10
    \s+
    (\(\d+\)\s\d+-\d+)    # Phone Number - 11
}x;

my $ua = LWP::UserAgent->new;
$ua->timeout(5);    # Is the site available?

my $jacket_id = 777390;    # Get the most recent data first
JACKET:
while ($jacket_id > 700000) {
    sleep 2;               # be polite to the server
    $jacket_id--;
    print $jacket_id . "\n";

    my $response = $ua->get(
        'https://contractorconnection.gpo.gov/abstract/' . $jacket_id,
        @ns_headers,
    );
    my $body = $response->content;
    $body = '' unless defined $body;

    # Match against the body: the response object itself stringifies to
    # "HTTP::Response=HASH(...)" and would never contain this text.
    if ($body =~ /Abstract Unavailable/) {
        print {$log_fh} $jacket_id . ",Unavailable\n";
        next JACKET;
    }
    if (!$response->is_success) {
        (my $status = $response->status_line) =~ tr/,\r\n//d;
        print {$log_fh} $jacket_id . ",ERROR," . $status . "\n";
        next JACKET;
    }

    # eval guards against CAM::PDF dying on a malformed document.
    my $pdf = eval { CAM::PDF->new($body) };
    if (!$pdf) {
        print {$log_fh} $jacket_id . ",ERROR,\n";
        next JACKET;
    }

    for my $page_num (1 .. $pdf->numPages) {
        my $text = $pdf->getPageText($page_num);
        next if !defined $text;

        # Every bid row starts with the jacket id; whatever precedes its
        # first occurrence on the page is not a bid row and is dropped.
        my @rows = split /\Q$jacket_id\E/, $text;
        shift @rows;

        ROW:
        for my $row (@rows) {
            my @fields = $row =~ $bid_re;
            if (!@fields) {
                # Not a bid row (or an unknown layout): log it instead of
                # writing a row of empty columns.
                print {$log_fh} $jacket_id . ",NOMATCH,page " . $page_num . "\n";
                next ROW;
            }

            # Remove the space between the $ and the digits.
            s/\s+//g for @fields[ AMOUNT, ADDL_RATE, DISCOUNT_PRICE ];

            # Remove every comma so no value can break the CSV columns.
            tr/,//d for @fields;

            # Contractor Code, Company Name, Bidders Name, Phone Number
            print {$company_fh} join(',',
                @fields[ CONTRACTOR_CODE, COMPANY_NAME, BIDDER_NAME, PHONE_NUMBER ]),
                "\n";

            # Title, Quantity, Contact are left empty; Winning_Contractor.
            # NOTE(review): this is written for every bid, not only for the
            # awarded one - confirm that this is intended.
            print {$solicit_fh} join(',',
                $jacket_id, '', '', '', $fields[CONTRACTOR_CODE]),
                "\n";

            if ($fields[AWARD_FLAG] eq 'A') {
                # Contractor Code, Date Received
                print {$award_fh} join(',',
                    $jacket_id, @fields[ CONTRACTOR_CODE, DATE_RECEIVED ]),
                    "\n";
            }

            # Contractor Code, Amount, Add'l Rate, Add'l Rate's Per,
            # Discount Days, Discount %, Discount Price
            print {$bid_fh} join(',',
                $jacket_id,
                @fields[
                    CONTRACTOR_CODE, AMOUNT,       ADDL_RATE, ADDL_RATE_PER,
                    DISCOUNT_DAYS,   DISCOUNT_PCT, DISCOUNT_PRICE
                ]),
                "\n";

            # All columns, in the order of the header line.
            print {$out_fh} join(',', $jacket_id, @fields), "\n";
        }
    }
}    # End while ()

# Buffered write errors only surface at close, so check every handle.
for my $fh ($company_fh, $bid_fh, $award_fh, $solicit_fh, $log_fh, $out_fh) {
    close $fh or die "Can't close an output file: $!";
}
</code>
11113472
11113478