I've had issues with running SOAP::Lite and connecting to Google's Search API, so I've looked for some alternate solutions. I found this NoXML solution (from HACKS,
http://hacks.oreilly.com/pub/h/174) that does not use SOAP::Lite for talking with Google's API. (Funny how it is named NoXML but still uses XML::Simple...) I think the code is close to being right, but it is giving me an error. It was giving me a 411 Length Required error, so I added the content_length to the request, but it doesn't seem to be working right as I now get this error:
parsing error: org.xml.sax.SAXParseException: Premature end of file.
Am I not calculating the Content-length right or is there another issue happening here?
# NoXML.pm
# NoXML [pronounced "no xml"] is a dire-need drop-in
# replacement for SOAP::Lite designed for Google Web API hacking.
package NoXML;
use strict;
no strict "refs";
# LWP for making HTTP requests, XML for parsing Google SOAP
use LWP::UserAgent;
use XML::Simple;
# Create a new NoXML
sub new {
my $self = {};
bless($self);
return $self;
}
# Replacement for the SOAP::Lite-based doGoogleSearch method
sub doGoogleSearch {
my($self, %args);
($self, @args{qw/ key q start maxResults filter restrict
safeSearch lr ie oe /}) = @_;
# grab SOAP request from _ _DATA_ _
my $tell = tell(DATA);
my $soap_request = join '', ;
seek(DATA, $tell, 0);
$soap_request =~ s/\$(\w+)/$args{$1}/ge; #interpolate variables
# Make (POST) a SOAP-based request to Google
my $ua = LWP::UserAgent->new;
my $req = HTTP::Request->new(POST => 'http://api.google.com/search/b
+eta2');
$req->content_type('text/xml');
my $leng = length($soap_request);
$req->content_length($leng);
$req->content($soap_request);
my $res = $ua->request($req);
my $soap_response = $res->as_string;
# Drop the HTTP headers and so forth until the initial xml element
$soap_response =~ s/^.+?(<\?xml)/$1/migs;
# Drop element namespaces for tolerance of future prefix changes
$soap_response =~ s!(<\/?)[\w-]+?:([\w-]+?)!$1$2!g;
# Set up a return dataset
my $return;
# Unescape escaped HTML in the resultset
my %unescape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"', '''=>"'
+");
my $unescape_re = join '|' => keys %unescape;
# Divide the SOAP response into the results and other metadata
my($before, $results, $after) = $soap_response =~
m#(^.+)(.+?)(.+$)#migs ;
my $before_and_after = $before . $after;
# Glean as much metadata as possible (while being somewhat lazy ;-)
while ($before_and_after =~ m#([^<]*?)<#migs) {
$return->{$1} = $3; # pack the metadata into the return dataset
}
# Glean the results
my @results;
while ($results =~ m#(.+?)#migs) {
my $item = $1;
my $pairs = {};
while ( $item =~ m#([^<]*)#migs ) {
my($element, $value) = ($1, $2);
$value =~ s/($unescape_re)/$unescape{$1}/g;
$pairs->{$element} = $value;
}
push @results, $pairs;
}
# Pack the results into the return dataset
$return->{resultElements} = \@results;
# Return nice, clean, usable results
return $return;
}
1;
# This is the SOAP message template sent to api.google.com. Variables
# signified with $variablename are replaced by the values of their
# counterparts sent to the doGoogleSearch subroutine.
__DATA__
<?xml version='1.0' encoding='UTF-8'?>
<SOAP-ENV:Envelope
xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/"
xmlns:xsi="http://www.w3.org/1999/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/1999/XMLSchema">
<SOAP-ENV:Body>
<ns1:doGoogleSearch xmlns:ns1="urn:GoogleSearch"
SOAP-ENV:encodingStyle="http://schemas.xmlsoap.org/soap/encoding/"
+>
<key xsi:type="xsd:string">$key</key>
<q xsi:type="xsd:string">$q</q>
<start xsi:type="xsd:int">$start</start>
<maxResults xsi:type="xsd:int">$maxResults</maxResults>
<filter xsi:type="xsd:boolean">$filter</filter>
<restrict xsi:type="xsd:string">$restrict</restrict>
<safeSearch xsi:type="xsd:boolean">$safeSearch</safeSearch>
<lr xsi:type="xsd:string">$lr</lr>
<ie xsi:type="xsd:string">$ie</ie>
<oe xsi:type="xsd:string">$oe</oe>
</ns1:doGoogleSearch>
</SOAP-ENV:Body>
</SOAP-ENV:Envelope>
And then the script to run it is next. I used Data::Dumper to see what the results were really saying:
#!/usr/bin/perl
# noxml_google2csv.pl
# Google Web Search Results via NoXML ("no xml") module
# exported to CSV suitable for import into Excel
# Usage: noxml_google2csv.pl "{query}" [> results.csv]
# Your Google API developer's key
my $google_key='insertyourkeyrighthere';
use strict;
use NoXML;
use Data::Dumper;
$ARGV[0] or die qq{usage: perl noxml_search2csv.pl "{query}"\n};
my $google_search = new NoXML;
my $results = $google_search ->
doGoogleSearch(
$google_key, shift @ARGV, 0, 10, "false",
"", "false", "", "latin1", "latin1"
);
@{$results->{'resultElements'}} or die('No results');
print Dumper(\$results);
print qq{"title","url","snippet"\n};
foreach (@{$results->{'resultElements'}}) {
$_->{title} =~ s!"!""!g; # double escape marks
$_->{snippet} =~ s!"!""!g;
my $output = qq{"$_->{title}","$_->{URL}","$_->{snippet}"\n};
$output =~ s!<.+?>!!g; # drop all html tags
print $output;
}
Edit (holli): Added readmore tags