#!/perl/bin/perl
#
# ean2bib.pl -- Application converts EANs to BibTeX data.
use strict;
use warnings;
use diagnostics;
use Getopt::Long;
use Pod::Usage;
use LWP::Simple;
use XML::Simple;
# Version string printed by --version.
our $VERSION = '0.02';

# Developer token sent to Amazon as the 'dev-t' query parameter; read from
# the AMAZON_DEVTOK environment variable (undefined when it is not set).
my $devtok = $ENV{'AMAZON_DEVTOK'};

# Parser used to decode Amazon's XML replies.
my $simple = XML::Simple->new();

# Working record for one BibTeX entry. '?' marks a field as "not known yet";
# the lookup routines only fill in fields that still hold this marker.
my %bib = map { $_ => '?' }
    qw(author title publisher address edition year ISBN image);

my $isbn_list;    # comma-separated batch of ISBNs for one Amazon request
my %lookup;       # publisher name => address, loaded from the --lookup file
my @isbns;        # every ISBN gathered from the input
my $isbn;
# Parse the command line. Each option is stored in a lexical declared in
# place, so the variables are visible to everything below this statement.
#
# 'output' takes a string value (the file name). It was previously declared
# as a plain flag, which made "-output foo.bib" set $opt_output to 1 and
# leave "foo.bib" in @ARGV as an input file.
GetOptions(
    'debug=i'    => \( my $debug = 0 ),
    'help|?'     => \( my $opt_help ),
    'man'        => \( my $opt_man ),
    'version'    => \( my $opt_version ),
    'lookup=s'   => \( my $opt_lookup ),
    'isbn'       => \( my $opt_isbn ),
    'output=s'   => \( my $opt_output ),
    'noappend'   => \( my $opt_noappend ),
    'location=s' => \( my $opt_location ),
    'image'      => \( my $opt_image ),
    'nolist'     => \( my $opt_nolist ),
    'authors'    => \( my $opt_authors ),
) or pod2usage(2);

# Informational switches are handled first and end the run.
if ($opt_version) {
    print "ean2bib.pl vrs. $VERSION\n";
    exit;
}
pod2usage(1) if $opt_help;
pod2usage( -verbose => 2 ) if $opt_man;
# Select the destination for the BibTeX entries. In debug mode the OUTPUT
# handle is aliased to STDOUT; otherwise it is the file given by --output
# (default 'ean2bib.bib'), appended to unless --noappend was given.
#
# Three-argument open keeps the file name out of the mode string, so a name
# with leading/trailing whitespace or mode characters is taken literally.
if ($debug) {
    *OUTPUT = *STDOUT;
}
else {
    my $bib_file  = $opt_output   ? $opt_output : 'ean2bib.bib';
    my $open_mode = $opt_noappend ? '>'         : '>>';
    open( OUTPUT, $open_mode, $bib_file )
        or die "Can't open $bib_file: $!\n";
}
# Load the optional publisher => address table. Each line of the lookup file
# has the form "publisher|address".
#
# The read loop had lost its <ADDRESS> operator, leaving "while ()", which
# is an endless loop in Perl; it now reads the file line by line.
if ($opt_lookup) {
    open( my $address_fh, '<', $opt_lookup )
        or die "Can't open $opt_lookup: $!\n";
    while ( my $line = <$address_fh> ) {
        chomp $line;
        my ( $publisher, $address ) = split( /\|/, $line );

        # Blank lines or lines without a '|' carry no usable mapping.
        next unless defined $publisher and defined $address;
        $lookup{$publisher} = $address;
    }
    close($address_fh);
}
# Gather the ISBNs to look up into @isbns.
#   * debug mode:        EANs come from the __DATA__ section of this script;
#   * no file arguments: codes are read interactively from standard input,
#                        as ISBNs with --isbn, otherwise as EANs;
#   * otherwise:         every command-line argument is glob-expanded and
#                        each resulting file is handed to filter().
#
# The debug loop had lost its <DATA> operator, leaving "while ()", which is
# an endless loop. Blank EAN lines are skipped rather than converted.
if ($debug) {
    while ( my $ean = <DATA> ) {
        chomp $ean;
        next unless $ean =~ /\S/;
        push @isbns, ean2isbn($ean);
    }
}
elsif ( @ARGV == 0 ) {
    if ($opt_isbn) {
        print "Enter ISBN+newline---^Z to end. ^C to cancel.\n";
        while ( my $line = <> ) {
            chomp $line;
            my ( $ok, $checkdigit ) = checkISBN($line);
            if ($ok) {
                push @isbns, $line;
            }
            elsif ( $checkdigit eq '-' ) {
                print "Too few characters in ISBN, 10 required.\n";
            }
            elsif ( $checkdigit eq '+' ) {
                print "Too many characters in ISBN, 10 required.\n";
            }
            else {
                print "Bad check digit in ISBN, one or more characters wrong.\n";
            }
        }
    }
    else {
        print "Enter EAN+newline---^Z to end. ^C to cancel\n";
        while ( my $ean = <> ) {
            chomp $ean;
            next unless $ean =~ /\S/;
            push @isbns, ean2isbn($ean);
        }
    }
}
else {
    # glob with no argument expands $_, i.e. each command-line argument.
    for my $file ( map { glob } @ARGV ) {
        filter($file);
    }
}
# Unless --nolist was given, save the collected ISBNs, one per line, to
# "<output file>.isbn" (default 'ean2bib.bib.isbn'), overwriting any
# previous list.
#
# Uses a lexical handle and three-argument open so the user-supplied name is
# never interpreted as part of the open mode, and checks close so a failed
# buffered write is reported.
unless ($opt_nolist) {
    my $list_file = ( $opt_output ? $opt_output : 'ean2bib.bib' ) . '.isbn';
    open( my $list_fh, '>', $list_file )
        or die "Couldn't open $list_file: $!\n";
    for my $entry (@isbns) {
        print {$list_fh} $entry, "\n";
    }
    close($list_fh) or die "Couldn't close $list_file: $!\n";
}
# Main loop: send the ISBNs to Amazon in batches of up to 30, and write one
# BibTeX @book entry to OUTPUT for every <Details> record in the reply.
#
# NOTE(review): the fields are written without separating commas and with an
# empty citation key; confirm this is what the downstream BibTeX tooling
# expects before relying on the file as-is.
while (@isbns) {
    $isbn_list = join( ",", splice( @isbns, 0, 30 ) );
    my $Amazon_webpage = get("http://xml.amazon.com/onca/xml2?t=webservices-20&dev-t=$devtok&AsinSearch=$isbn_list&type=lite&f=xml");
    if ($Amazon_webpage) {

        # A malformed reply makes XMLin die; report it and go on with the
        # next batch instead of aborting the whole run.
        my $root = eval { $simple->XMLin($Amazon_webpage) };
        unless ( ref($root) eq 'HASH' ) {
            print STDERR "Couldn't parse Amazon reply: ",
                ( $@ || "unexpected structure\n" );
            next;
        }

        # XML::Simple folds a lone <Details> element into a hash reference
        # and only produces an array reference when there are several, so
        # normalise both shapes (and a missing element) to a list.
        my $details = $root->{'Details'};
        my @records =
              ref($details) eq 'ARRAY' ? @$details
            : defined $details         ? ($details)
            :                            ();

        for my $record (@records) {

            # Reset every field to the "unknown" marker for this record.
            for my $field ( keys %bib ) {
                $bib{$field} = '?';
            }
            ParseAmazon( $record, \%bib );

            print OUTPUT "\@book{,\n";
            for my $field (qw(author title publisher address year ISBN)) {
                print OUTPUT " $field=\{", $bib{$field}, "\}\n";
            }
            if ($opt_location) {
                print OUTPUT " location=\{", $opt_location, "\}\n";
            }
            if ($opt_image) {
                print OUTPUT " image=\{", $bib{'image'}, "\}\n";
            }
            print OUTPUT "}\n\n";
        }
    }
    else {
        print "Bad luck this time\n";
    }
}

# Buffered write errors only surface at close, so report them.
close(OUTPUT) or warn "Can't close output: $!\n";
# filter( $file )
#
# Read one code per line from $file and append the resulting ISBNs to the
# file-level @isbns. With --isbn each line is validated by checkISBN() and
# rejected lines are reported on STDERR; otherwise each non-blank line is
# treated as an EAN and converted by ean2isbn(). The file name is echoed to
# STDERR before reading. Dies if the file cannot be opened.
#
# The read loop had lost its <FILE> operator, leaving "while ()", which is an
# endless loop. The file is now opened with three-argument open on a lexical
# handle, so a command-line name is never interpreted as an open mode or
# a pipe.
sub filter {
    my $file = shift;
    open( my $in_fh, '<', $file ) or die "Couldn't open $file: $!\n";
    print STDERR "$file:\n";
    while ( my $line = <$in_fh> ) {
        chomp $line;
        if ($opt_isbn) {
            my ( $ok, $checkdigit ) = checkISBN($line);
            if ($ok) {
                push @isbns, $line;
            }
            elsif ( $checkdigit eq '-' ) {
                print STDERR "Too few characters in ISBN '$line', 10 required.\n";
            }
            elsif ( $checkdigit eq '+' ) {
                print STDERR "Too many characters in ISBN '$line', 10 required.\n";
            }
            else {
                print STDERR "Bad check digit in ISBN '$line', one or more characters wrong.\n";
            }
        }
        else {
            # A blank line is not an EAN; converting it would yield garbage.
            next unless $line =~ /\S/;
            push @isbns, ean2isbn($line);
        }
    }
    close($in_fh);
    return;
}
# ParseAmazon( $detail, $bib )
#
# Fill the record %$bib from one Amazon <Details> hash ($detail), then try to
# complete it from the Library of Congress (scrapeLOC) and, for the address,
# from the --lookup table.
#
# A field is only overwritten when Amazon actually supplied a plain value for
# it. Previously a missing element replaced the '?' marker with undef, which
# stopped the later "still unknown" tests from matching and so disabled the
# Library of Congress fallback for that field.
#
# With --authors, a multi-author list from Amazon is held back and used only
# if the Library of Congress lookup yields no author.
sub ParseAmazon {
    my ( $detail, $bib ) = @_;

    # Return the element only if it is a usable plain string.
    # (XML::Simple represents an empty element as a hash reference.)
    my $plain = sub {
        my $value = $detail->{ $_[0] };
        return ( defined $value and not ref $value ) ? $value : undef;
    };

    my %amazon_key_for = (
        title     => 'ProductName',
        publisher => 'Manufacturer',
        ISBN      => 'Asin',
    );
    for my $field ( keys %amazon_key_for ) {
        my $value = $plain->( $amazon_key_for{$field} );
        $bib->{$field} = clean($value) if defined $value;
    }

    my $image = $plain->('ImageUrlSmall');
    $bib->{'image'} = $image if defined $image;

    # Keep only the trailing number of the release date as the year.
    my $year = $plain->('ReleaseDate');
    if ( defined $year ) {
        $year =~ s/.*? (\d+)$/$1/;
        $bib->{'year'} = $year;
    }

    # 'Author' is an array reference for several authors, a string for one.
    my $authors =
        ref( $detail->{'Authors'} ) eq 'HASH'
        ? $detail->{'Authors'}->{'Author'}
        : undef;
    my @list;
    if ( ref($authors) eq 'ARRAY' ) {
        @list = @$authors;
        unless ($opt_authors) {
            $bib->{'author'} = clean( join( " and ", @list ) );
        }
    }
    elsif ( defined $authors and not ref $authors ) {
        $bib->{'author'} = clean($authors);
    }

    my $webpage = get("http://lcweb.loc.gov/cgi-bin/zclient?host=z3950.loc.gov&port=7090&attrset=BIB1&rtype=USMARC&DisplayRecordSyntax=HTML&ESN=F&startrec=1&maxrecords=10&dbname=Voyager&srchtype=1,7,2,3,3,1,4,1,5,1,6,1&term_term_1=$bib->{'ISBN'}");
    scrapeLOC( $webpage, $bib );

    # Address backstop: use the record passed in, not the file-level %bib.
    if ( $bib->{'address'} eq '?' and $opt_lookup ) {
        my $publisher = $bib->{'publisher'};
        if ( defined $publisher and exists $lookup{$publisher} ) {
            $bib->{'address'} = $lookup{$publisher};
        }
    }

    # Fall back to Amazon's author list if nothing better was found.
    if ( $bib->{'author'} eq '?' and scalar(@list) ) {
        $bib->{'author'} = clean( join( " and ", @list ) );
    }
    return;
}
# GetSubfield( $subfield, $key )
#
# Return the cleaned 'content' of a subfield.
#   * When $subfield is an array reference, the first element whose 'code'
#     equals $key is used; if none matches, nothing is returned (undef in
#     scalar context, the empty list in list context). The routine used to
#     fall off the end of the loop here, which leaves the return value
#     unspecified.
#   * Otherwise $subfield is treated as a single hash reference and its
#     'content' is returned without looking at $key.
sub GetSubfield {
    my ( $subfield, $key ) = @_;
    if ( ref($subfield) eq 'ARRAY' ) {
        for my $entry (@$subfield) {
            return clean( $entry->{'content'} ) if $entry->{'code'} eq $key;
        }
        return;
    }
    return clean( $subfield->{'content'} );
}
# scrapeLOC( $webpage, $bib )
#
# Fill in the still-unknown fields of %$bib from the text of a Library of
# Congress result page. Nothing happens when $webpage is empty/undefined or
# reports "0</b> records".
#
# The page is treated as a series of "Label:" lines. Every label is mapped to
# the label that follows it, and parsespan() extracts the text between the
# two. A field counts as unknown when it holds '?' or is undefined.
#
# Fixes relative to the earlier version:
#   * the "Published:" pattern is now tested before its captures are used
#     (a failed match used to read stale $1..$3 from an earlier match);
#   * the ISBN pattern accepts a trailing X after nine digits
#     ("\d{9}\d|X" matched either ten digits or a lone X);
#   * a title without a "/ author" part no longer operates on undef;
#   * a page without any label is ignored;
#   * Author and Edition spans are trimmed before use, as Title and
#     Published already were.
sub scrapeLOC {
    my ( $webpage, $bib ) = @_;

    return unless $webpage and $webpage !~ /0<\/b> records/;

    # Every "Label:" that starts a line, in page order.
    my @labels;
    while ( $webpage =~ /^(.*?:)/mg ) {
        push @labels, $1;
    }
    return unless @labels;

    # label => terminator of its span (next label, or </PRE> for the last).
    my %fields;
    for my $i ( 0 .. $#labels - 1 ) {
        $fields{ $labels[$i] } = $labels[ $i + 1 ];
    }
    $fields{ $labels[-1] } = '<\/PRE>';

    my $unknown = sub {
        my $field = shift;
        return ( !defined $bib->{$field} or $bib->{$field} eq '?' );
    };

    # Text following $label, or '' when the span cannot be extracted.
    my $span_for = sub {
        my $label = shift;
        my $span  = parsespan( $label, $fields{$label}, $webpage );
        return defined $span ? $span : '';
    };

    # "Title:" holds "title / author statement".
    if ( exists $fields{'Title:'} ) {
        my ( $title, $author ) =
            map { alltrim($_) } split( /\//, $span_for->('Title:') );
        if ( defined $author ) {
            $author =~ s/,/ and/g;
            $author =~ s/\.$//;
        }
        $bib->{'title'}  = $title  if $title  and $unknown->('title');
        $bib->{'author'} = $author if $author and $unknown->('author');
    }

    if ( exists $fields{'Author:'} and $unknown->('author') ) {
        my $author = alltrim( $span_for->('Author:') );
        $author =~ s/\.$//;
        $bib->{'author'} = $author if $author;
    }

    # Only the first word of the edition statement is kept.
    if ( exists $fields{'Edition:'} and $unknown->('edition') ) {
        my $edition = alltrim( $span_for->('Edition:') );
        $edition =~ s/(.*?)\s.*$/$1/;
        $bib->{'edition'} = $edition if $edition;
    }

    if ( exists $fields{'ISBN:'} and $unknown->('ISBN') ) {
        if ( $span_for->('ISBN:') =~ /(\d{9}[\dX])/ ) {
            $bib->{'ISBN'} = $1;
        }
    }

    # "Published:" is expected as "place : publisher, ... year".
    if ( exists $fields{'Published:'} ) {
        if ( $span_for->('Published:') =~ /(.*?)\s:\s(.*?),.*?(\d{4}).*$/ ) {
            my ( $address, $publisher, $year ) = ( $1, $2, $3 );
            $bib->{'publisher'} = $publisher
                if $publisher and $unknown->('publisher');
            $bib->{'address'} = alltrim($address)
                if $address and $unknown->('address');
            $bib->{'year'} = $year if $year and $unknown->('year');
        }
    }
    return;
}
# parsespan( $begin, $end, $s )
#
# Return the text of $s that lies between $begin and the following $end,
# with newlines turned into spaces and runs of whitespace collapsed to one
# space. The capture may not itself contain $begin, so the span starts at
# the last $begin that precedes an $end. $begin and $end are interpolated
# into the pattern as regular expressions, not as literal strings.
#
# Returns the empty string when $s is undefined or the span is not found.
# The match used to go unchecked, so on failure $1 still held whatever the
# caller's last successful match had captured and that was returned instead.
sub parsespan {
    my ( $begin, $end, $s ) = @_;
    return ''
        unless defined $s
        and $s =~ /$begin((?:(?!$begin).)*)$end/ms;
    my $span = $1;
    $span =~ s/\n/ /g;
    $span =~ s/\s\s+/ /g;
    return $span;
}
# alltrim( $s )
#
# Return a copy of $s with leading and trailing whitespace removed.
sub alltrim {
    my $s = shift;
    for ($s) {
        s/^\s+//;
        s/\s+$//;
    }
    return $s;
}
# ean2isbn( $ean )
#
# Convert a Bookland EAN to a ten-character ISBN: characters 4 to 12 of the
# EAN are the first nine ISBN digits, and the ISBN check digit is recomputed
# by checkDigit().
#
# Whitespace and hyphens are ignored, and anything following the first 13
# digits is ignored as well. Input that does not start with 978 followed by
# ten digits is reported on STDERR and yields the empty list (undef in scalar
# context), so callers that push the result simply add nothing. Such input
# used to be converted anyway; a blank line, for instance, became the
# bogus ISBN "0".
sub ean2isbn {
    my $ean = shift;
    $ean = '' unless defined $ean;
    $ean =~ s/[\s-]+//g;
    unless ( $ean =~ /\A978\d{10}/ ) {
        print STDERR "Skipping '$ean': not a 978-prefixed 13 digit EAN.\n";
        return;
    }
    my $isbn = substr( $ean, 3, 10 );
    return substr( $isbn, 0, 9 ) . checkDigit($isbn);
}
# checkISBN( $isbn )
#
# Validate a ten-character ISBN. Returns a two-element list:
#   ( 0, '-' )         fewer than 10 characters
#   ( 0, '+' )         more than 10 characters
#   ( $ok, $expected ) otherwise, where $expected is the check digit computed
#                      by checkDigit() and $ok is 1 when it equals the last
#                      character of $isbn (compared case-sensitively), else 0
sub checkISBN {
    my $candidate = shift;
    my $length    = length($candidate);

    return ( 0, '-' ) if $length < 10;
    return ( 0, '+' ) if $length > 10;

    my $expected = checkDigit($candidate);
    my $ok       = 0;
    $ok = 1 if $expected eq substr( $candidate, -1, 1 );
    return ( $ok, $expected );
}
# checkDigit( $isbn )
#
# Compute the ISBN-10 check character for $isbn. The last character of the
# argument is ignored; the preceding ones are weighted 10, 9, 8, ... from the
# left and summed, and the sum modulo 11 selects the result from a table
# (remainder 0 gives '0', remainder 1 gives 'X', then '9' down to '1').
sub checkDigit {
    my @chars = split( //, uc(shift) );
    pop @chars;    # the final position is the check digit itself

    my $weight = 10;
    my $total  = 0;
    for my $char (@chars) {
        $total += $char * $weight--;
    }

    my @check_for = qw(0 X 9 8 7 6 5 4 3 2 1);
    return $check_for[ $total % 11 ];
}
# clean( $s )
#
# Tidy a field value and return the result. In order: one trailing
# punctuation character (comma, colon, backslash, slash or period) is
# removed, then trailing whitespace, then every character in the range
# 0x7F-0xFF.
sub clean {
    my $text = shift;
    $text =~ s{[,:\\/.]$}{};
    $text =~ s{\s+$}{};
    $text =~ tr/\x7f-\xff//d;
    return $text;
}
=head1 NAME
ean2bib.pl -- Application to create BibTeX data from EANs.
=head1 SYNOPSIS
ean2bib.pl [options] filespec
Options:
-debug set debug level, default is off
-help brief help message
-man full documentation
-version version number
-lookup publisher address lookup file
-isbn ISBN entry versus EAN entry, default is EAN entry
-output name of file to write bib entries to, defaults to 'ean2bib.bib'
-noappend overwrite the output file instead of appending to it, default is to append
-location add 'location' tag to bib entry.
-image add 'image' tag to bib entry using Amazon's small image
-nolist suppress backup of ISBNs, defaults to OFF (do backup list)
-authors use Library of Congress lookup instead of Amazon for 'authors' field
Switches that don't define a value can be done in long or short form.
eg:
ean2bib.pl --man
ean2bib.pl -m
=head1 OPTIONS
=over 8
=item B<-debug>
Display debug information as program is executed. Control is set by level of the value
passed on the command line. Default value is off (debug == 0). Setting this to something
greater than zero will also send the output to STDOUT, overriding the normal defaults.
Skips appending etc.
=item B<-help>
Print a brief help message and exit.
=item B<-man>
Print the manual page (full documentation) and exit.
=item B<-version>
Print the version number and exit.
=item B<-lookup>
Specify a 'lookup' file for publisher addresses.
=item B<-isbn>
Flag to allow ISBN entry versus EAN entry. Default is EAN entry.
=item B<-output>
Name of file to write output to. Default is 'ean2bib.bib'.
=item B<-noappend>
Flag to turn off append mode for output file. Default is OFF (do append).
=item B<-location>
Add a 'location' tag with the given value to each bib entry.
=item B<-image>
Turn on 'image' tag of bib entry. Currently 'small' image is hardcoded in.
=item B<-nolist>
Suppress backup of ISBNs, defaults to OFF (do backup list).
=item B<-authors>
Note AMAZON currently has a bug in that they think it is alright to list
'Vic Broquard', 'Broquard Vic', and 'Victor E. Broquard' as multiple authors!
Using -authors forces a LOC lookup instead of accepting the AMAZON lookup for
those cases of multiple authorship. Does not affect single authorship lookup.
=back
=head1 DESCRIPTION
This application uses the AMAZON web services API and the Library of Congress
MARC database to create a BibTeX entry based solely on the EAN taken either from
text files specified on the command line or entered by way of a bar code wand at
the command line.
=head2 EXAMPLE
Suppose you had a file of EANs, say eans.txt that contained:
9780446611336
9780451458711
9780446610902
9780596004361
9780201185379
9780201489460
9780764545696
9780138482763
One EAN per line. Then say you typed 'ean2bib eans.txt' at the command line. The
result would be the creation of a file called 'ean2bib.bib.isbn' and another
called 'ean2bib.bib'. The '.bib' file would be appended to if it already existed. On the
first run, however, it would look like this (shortened slightly for POD):
@book{,
author={Rebecca Neason}
title={The Truest Power}
publisher={Warner Books}
address={?}
year={2002}
ISBN={0446611336}
}
@book{,
author={Alan F. Troop}
title={The Dragon Delasangre}
publisher={Roc}
address={New York}
year={2002}
ISBN={0451458710}
}
@book{,
author={Katherine Kurtz}
title={Crusade of Fire: Mystical Tales of the Knights Templar}
publisher={Warner Books}
address={?}
year={2002}
ISBN={0446610909}
}
@book{,
author={Peter Prinz and Ulla Kirch-Prinz}
title={C Pocket Reference}
publisher={O'Reilly & Associates}
address={?}
year={2002}
ISBN={0596004362}
}
.
.
.
@book{,
author={Vic Broquard and Broquard Vic and Victor E. Broquard}
title={Intermediate Mfc}
publisher={Prentice Hall PTR}
address={Upper Saddle River, NJ}
year={1998}
ISBN={0138482764}
}
In other words, the output is suitable as input into a LaTeX document as a .bib file.
Noticeable is the occasional 'address={?}' entry. This is because of two things: first,
Amazon doesn't supply publisher address information. Second, while the Library of Congress
is better, sometimes it doesn't either! Because of this, the --lookup option was added.
This option supplies a text file to use as an address lookup table backstop. As an
example:
A. H. Baily and Co.|London, UK
Ace Books|New York, NY
Ace|New York, NY
ActiveState Tool Corp.|Vancouver, Canada, BC
Addison Wesley|Boston, MA
Addison Wesley|Reading, MA
Addison-Wesley|Upper Saddle River, NJ
Aeonian Press|Mattituck, NY
Al. Kalmajs Prtg. Co.|Chicago, Illinois
Aladdin|New York, NY
Albert E. Woolum|N. Richland Hills, TX
Alfred Kalnajs and Son|Gulfport, FL
Alfreds Kalnajs|Chicago, Illinois
American Chess Promotions|Macon, GA
American Chess Promotion|Macon, GA
American Chess Quarterly|Columbus, OH
Andreyevski Flag|Himberg Austria
Arbor House|New York, NY
Aspect|New York, NY
Atheneum|New York, NY
Atria|New York, NY
AvoNova|New York, NY
Avon Books (Trd)|New York, NY
Avon|New York, NY
Forms the first few lines of my 'address.txt' file. The format is simple; publisher
name followed by publisher address, delimited by '|'.
Also noticeable is the 'author={Vic Broquard and Broquard Vic and Victor E. Broquard}' from
the last EAN in the example. Amazon currently thinks this is a good thing! I think
it is a bug, but Amazon is bigger than I am. At any rate, I've added the --authors option
to fix this. Briefly, it skips the author information from Amazon in favor of the author
information from the Library of Congress lookup.
=head2 WHAT IS LATEX?
LaTeX is a high-quality typesetting system, with features designed for the
production of technical and scientific documentation. LaTeX is the de facto
standard for the communication and publication of scientific documents.
=head2 WHAT IS BIBTEX?
From http://www.santafe.edu/~vince/MacBibTeX.html:
BibTeX is the bibliography handling tool related to the TeX/LaTeX typesetting
system (available on almost all conceivable OS platforms). When a document is
typeset with LaTeX, an auxiliary file (with extension '.aux') is generated.
This auxiliary file contains, amongst other things, a list of references (to
books, articles, research reports, web-url's,...) cited in the original document.
BibTeX takes this list, together with the name of a style file (extension '.bst')
and a list of bibliography databases (extension '.bib') which are also given
in the auxiliary file, and produces a wonderfully formatted list of references
which the LaTeX system subsequently (and automatically) appends to your typeset
document. For those who think this sounds a bit too complicated, the benefits in
terms of quality of output (both textual and mathematical) and automatic generation
of all manner of numbering schemes (figures, equations, sections), and document
parts (table of contents, index, bibliography), together with cross-platform
availability, mean that many people consider it the _best_ current system for
producing quality documents.
=head2 WHAT IS A BIB FILE?
From http://pax.st.usm.edu/~kolibal/tex_html/bib_html/bib.html:
There are several approaches to including a bibliography into a LaTeX document. The
easiest, but not the best is to construct a bbl file. The bbl file, of FILE.bbl
where file is the name of the LaTeX file simply contains the command LaTeX
interprets to write out the bibliography. The difficulty with a bbl file is
that it is rigid. Once constructed, in order to change the formatting or ordering,
manually rewriting the file is the only option available.
Instead, the preferred mechanism for producing a bibliography is to construct a bib
file, i.e., a file, usually with the name FILE.bib. The bib file is a database
file, which can be processed by the utility bibtex to produce a bbl file. The
advantage is that the formatting of the bbl file is then determined by a style
template, consequently the bibliography can easily be reformatted to meet the
requirements of a particular publication.
=head2 WHAT IS AN EAN?
EAN stands for European Article Number and in this
context refers to the Bookland EAN, a thirteen digit, strictly numeric bar code for
the publishing industry. It is derived from the ISBN number assigned to a particular
title.
=head1 AUTHOR
Hugh S. Myers
hsmyers@sdragons.com
=head1 BUGS
None that I know of.
=head1 TODO
Mon May 26 10:51:13 2003 Add LCCN option for older books.
Mon May 26 11:08:04 2003 Convert file entry and command line entry to use same code.
Mon May 26 12:55:47 2003 Correct flaw in logic for checking EANs. DONE
Mon May 26 13:50:10 2003 Add 'image' option for Amazon thumbnail image URLs. DONE
Fri Jun 06 09:21:52 2003 Drop VBZOOMC.ZoomFactory in favor of a web scrape. DONE
Fri Jun 06 09:23:00 2003 Add CS1504 option to set barcode type and location info.
Fri Jun 06 09:45:21 2003 Begin adding debug features for testing. DONE
Fri Jun 06 11:16:20 2003 Add 'authors' option as an Amazon bug work-around. DONE
=head1 UPDATES
0.02 Removes dependency on ActiveX .dll. Improve documentation.
=cut
__DATA__
9780446611336
9780451458711
9780446610902
9780596004361
9780201185379
9780201489460
9780764545696
9780138482763