#!/perl/bin/perl
#
# ean2bib.pl -- Application converts EANs to BibTeX data.
#
# Flow of the script:
#   1. Collect ISBN-10s -- converted from EANs unless --isbn is given --
#      from the embedded __DATA__ section (--debug), from STDIN (no file
#      arguments) or from the files named on the command line.
#   2. Unless --nolist is given, write the collected ISBNs to a backup file.
#   3. Query Amazon for the ISBNs in batches, fill gaps from a Library of
#      Congress page scrape and the optional --lookup file, and print one
#      @book entry per result.

use strict;
use warnings;
use diagnostics;

use Getopt::Long;
use Pod::Usage;
use LWP::Simple;
use XML::Simple;

our $VERSION = '0.02';

# Developer token sent to Amazon; read from the environment.
my $devtok = $ENV{'AMAZON_DEVTOK'};
my $simple = XML::Simple->new();

# Template for one bibliography entry.  '?' marks "not known yet"; the
# lookup code only fills in fields that still hold this placeholder.
my %bib = (
    author    => '?',
    title     => '?',
    publisher => '?',
    address   => '?',
    edition   => '?',
    year      => '?',
    ISBN      => '?',
    image     => '?',
);

my %lookup;    # publisher name => address, loaded from the --lookup file
my @isbns;     # ISBN-10s waiting to be looked up

GetOptions(
    'debug=i'    => \( my $debug = 0 ),
    'help|?'     => \( my $opt_help ),
    'man'        => \( my $opt_man ),
    'version'    => \( my $opt_version ),
    'lookup=s'   => \( my $opt_lookup ),
    'isbn'       => \( my $opt_isbn ),
    # --output names a file, so it must take a string.  Declared as a bare
    # flag it was set to 1 and the file name stayed in @ARGV as an input file.
    'output=s'   => \( my $opt_output ),
    'noappend'   => \( my $opt_noappend ),
    'location=s' => \( my $opt_location ),
    'image'      => \( my $opt_image ),
    'nolist'     => \( my $opt_nolist ),
    'authors'    => \( my $opt_authors ),
) or pod2usage(2);

if ($opt_version) {
    print "ean2bib.pl vrs. $VERSION\n";
    exit;
}
pod2usage(1) if $opt_help;
pod2usage( -verbose => 2 ) if $opt_man;

unless ( defined $devtok and length $devtok ) {
    warn "AMAZON_DEVTOK is not set in the environment; "
      . "sending an empty developer token.\n";
    $devtok = '';
}

# Destination for the bib entries: STDOUT in debug mode, otherwise the
# --output file (default 'ean2bib.bib'), appended to unless --noappend.
my $out_name =
  ( defined $opt_output and length $opt_output ) ? $opt_output : 'ean2bib.bib';
my $out_fh;
if ($debug) {
    $out_fh = \*STDOUT;
}
else {
    my $mode = $opt_noappend ? '>' : '>>';
    open( $out_fh, $mode, $out_name ) or die "Can't open $out_name: $!\n";
}

# Load the optional "publisher|address" lookup table.
if ($opt_lookup) {
    open( my $address_fh, '<', $opt_lookup )
      or die "Can't open $opt_lookup: $!\n";
    while ( defined( my $line = <$address_fh> ) ) {
        chomp $line;
        my ( $publisher, $address ) = split( /\|/, $line );
        next unless defined $publisher and defined $address;
        $lookup{$publisher} = $address;
    }
    close($address_fh);
}

# Collect the ISBNs to look up.
if ($debug) {

    # Debug runs use the sample EANs stored after __DATA__.
    while ( defined( my $ean = <DATA> ) ) {
        chomp $ean;
        push( @isbns, ean2isbn($ean) );
    }
}
elsif ( @ARGV == 0 ) {
    if ($opt_isbn) {
        print "Enter ISBN+newline---^Z to end. ^C to cancel.\n";
        while ( defined( my $line = <> ) ) {
            chomp $line;
            my ( $ok, $checkdigit ) = checkISBN($line);
            if ($ok) {
                push( @isbns, uc $line );
            }
            elsif ( $checkdigit eq '-' ) {
                print "Too few characters in ISBN, 10 required.\n";
            }
            elsif ( $checkdigit eq '+' ) {
                print "Too many characters in ISBN, 10 required.\n";
            }
            else {
                print "Bad check digit in ISBN, one or more characters wrong.\n";
            }
        }
    }
    else {
        print "Enter EAN+newline---^Z to end. ^C to cancel\n";
        while ( defined( my $line = <> ) ) {
            chomp $line;
            push( @isbns, ean2isbn($line) );
        }
    }
}
else {
    for my $file ( map { glob($_) } @ARGV ) {
        filter($file);
    }
}

# Keep a backup list of the ISBNs next to the output file.
unless ($opt_nolist) {
    my $list_name = "$out_name.isbn";
    open( my $list_fh, '>', $list_name )
      or die "Couldn't open $list_name: $!\n";
    for my $isbn (@isbns) {
        print {$list_fh} $isbn, "\n";
    }
    close($list_fh) or warn "Error closing $list_name: $!\n";
}

# Look the ISBNs up, at most 30 per Amazon request.
while (@isbns) {
    my $isbn_list = join( ",", splice( @isbns, 0, 30 ) );
    my $amazon_xml = get(
"http://xml.amazon.com/onca/xml2?t=webservices-20&dev-t=$devtok&AsinSearch=$isbn_list&type=lite&f=xml"
    );

    unless ($amazon_xml) {
        print "Bad luck this time\n";
        next;
    }

    # XMLin dies on malformed XML; skip this batch instead of losing the
    # remaining ones.
    my $root = eval { $simple->XMLin($amazon_xml) };
    unless ( ref($root) eq 'HASH' ) {
        warn "Could not parse Amazon response", ( $@ ? ": $@" : ".\n" );
        next;
    }

    # XML::Simple folds a single <Details> element into a hash reference and
    # only produces an array reference when there are several of them.
    my $details = $root->{'Details'};
    my @details =
        ref($details) eq 'ARRAY' ? @$details
      : defined $details         ? ($details)
      :                            ();

    for my $detail (@details) {
        $bib{$_} = '?' for keys %bib;
        ParseAmazon( $detail, \%bib );

        # NOTE(review): fields are written without separating commas, exactly
        # as the manual's example shows.  BibTeX normally expects a comma
        # between fields -- confirm before changing the output format.
        print {$out_fh} "\@book{,\n";
        for my $field (qw(author title publisher address year ISBN)) {
            print {$out_fh} " $field={", $bib{$field}, "}\n";
        }
        if ($opt_location) {
            print {$out_fh} " location={", $opt_location, "}\n";
        }
        if ($opt_image) {
            print {$out_fh} " image={", $bib{'image'}, "}\n";
        }
        print {$out_fh} "}\n\n";
    }
}
close($out_fh) or warn "Error closing output: $!\n";

# filter($file)
#
# Reads $file line by line and appends the resulting ISBN-10s to the
# file-level @isbns.  With --isbn each line is validated as an ISBN-10 and
# problems are reported on STDERR; otherwise each line is converted from an
# EAN with ean2isbn().  Dies if the file cannot be opened.
sub filter {
    my $file = shift;

    open( my $in_fh, '<', $file ) or die "Couldn't open $file: $!\n";
    print STDERR "$file:\n";

    while ( defined( my $line = <$in_fh> ) ) {
        chomp $line;
        unless ($opt_isbn) {
            push( @isbns, ean2isbn($line) );
            next;
        }

        my ( $ok, $checkdigit ) = checkISBN($line);
        if ($ok) {
            push( @isbns, uc $line );
        }
        elsif ( $checkdigit eq '-' ) {
            print STDERR "Too few characters in ISBN '$line', 10 required.\n";
        }
        elsif ( $checkdigit eq '+' ) {
            print STDERR "Too many characters in ISBN '$line', 10 required.\n";
        }
        else {
            print STDERR
"Bad check digit in ISBN '$line', one or more characters wrong.\n";
        }
    }
    close($in_fh);
    return;
}

# _amazon_text($value)
#
# Returns the cleaned text of an Amazon field, or the '?' placeholder when
# the field is missing, is not a plain string, or is empty after cleaning.
# Keeping the placeholder lets the later fallbacks fill the field in.
sub _amazon_text {
    my $value = shift;
    return '?' if !defined $value or ref $value;
    my $text = clean($value);
    return length $text ? $text : '?';
}

# ParseAmazon($detail, $bib)
#
# Fills the entry hash referenced by $bib from one parsed Amazon <Details>
# record ($detail, a hash reference), then tries to complete it:
#   * scrapeLOC() fills fields that are still '?' from the Library of
#     Congress page fetched for the entry's ISBN;
#   * the --lookup table supplies the address for a known publisher;
#   * with --authors, a multi-author Amazon list is only used when the
#     Library of Congress supplied no author.
sub ParseAmazon {
    my ( $detail, $bib ) = @_;

    $bib->{'title'}     = _amazon_text( $detail->{'ProductName'} );
    $bib->{'publisher'} = _amazon_text( $detail->{'Manufacturer'} );
    $bib->{'ISBN'}      = _amazon_text( $detail->{'Asin'} );

    my $image = $detail->{'ImageUrlSmall'};
    $bib->{'image'} = $image if defined $image and !ref $image;

    # Keep only the trailing number of the release date as the year.
    my $year = $detail->{'ReleaseDate'};
    if ( defined $year and !ref $year ) {
        $year =~ s/.*? (\d+)$/$1/;
        $bib->{'year'} = $year;
    }

    # 'Author' is an array reference for several authors and a plain string
    # for a single one.
    my $authors =
      ref( $detail->{'Authors'} ) eq 'HASH'
      ? $detail->{'Authors'}->{'Author'}
      : undef;
    my @list;
    if ( ref($authors) eq 'ARRAY' ) {
        @list = @$authors;
        unless ($opt_authors) {
            $bib->{'author'} = clean( join( " and ", @list ) );
        }
    }
    elsif ( defined $authors and !ref $authors ) {
        $bib->{'author'} = clean($authors);
    }

    my $webpage = get(
"http://lcweb.loc.gov/cgi-bin/zclient?host=z3950.loc.gov&port=7090&attrset=BIB1&rtype=USMARC&DisplayRecordSyntax=HTML&ESN=F&startrec=1&maxrecords=10&dbname=Voyager&srchtype=1,7,2,3,3,1,4,1,5,1,6,1&term_term_1=$bib->{'ISBN'}"
    );
    scrapeLOC( $webpage, $bib );

    # Use the entry passed in, not the file-level %bib, for the lookup key.
    if (    $bib->{'address'} eq '?'
        and $opt_lookup
        and exists $lookup{ $bib->{'publisher'} } )
    {
        $bib->{'address'} = $lookup{ $bib->{'publisher'} };
    }

    if ( $bib->{'author'} eq '?' and @list ) {
        $bib->{'author'} = clean( join( " and ", @list ) );
    }
    return;
}

# GetSubfield($subfield, $key)
#
# $subfield is either one hash reference or an array reference of hash
# references carrying 'code' and 'content' keys.  Returns the cleaned
# content of the first element whose code equals $key (or of the single
# hash), and nothing when no element matches.  Not called from this file.
sub GetSubfield {
    my ( $subfield, $key ) = @_;

    if ( ref($subfield) eq 'ARRAY' ) {
        for my $entry (@$subfield) {
            return clean( $entry->{'content'} ) if $entry->{'code'} eq $key;
        }
        return;
    }
    return clean( $subfield->{'content'} );
}

# scrapeLOC($webpage, $bib)
#
# Extracts "Label: value" fields from the Library of Congress result page
# in $webpage and stores them in the entry hash referenced by $bib, but only
# for fields that still hold the '?' placeholder.  Does nothing when the
# page is empty or reports "0 records".
sub scrapeLOC {
    my ( $webpage, $bib ) = @_;
    return unless $webpage and $webpage !~ /0<\/b> records/;

    # Every line start up to its first colon is treated as a label.
    my @labels;
    while ( $webpage =~ /^(.*?:)/mg ) {
        push( @labels, $1 );
    }
    return unless @labels;

    # Map each label to the text that ends its value: the next label, or the
    # closing </PRE> tag for the last one.
    my %fields;
    for my $i ( 0 .. $#labels - 1 ) {
        $fields{ $labels[$i] } = $labels[ $i + 1 ];
    }
    $fields{ $labels[-1] } = '</PRE>';

    if ( exists $fields{'Title:'} ) {

        # The title field is split at '/': the first part is taken as the
        # title and the second as the author text.
        my $span = parsespan( 'Title:', $fields{'Title:'}, $webpage );
        my ( $title, $author ) = map { alltrim($_) } split( /\//, $span );
        if ( defined $author ) {
            $author =~ s/,/ and/g;
            $author =~ s/\.$//;
        }
        $bib->{'title'} = $title if $title and $bib->{'title'} eq '?';
        $bib->{'author'} = $author if $author and $bib->{'author'} eq '?';
    }

    if ( exists $fields{'Author:'} and $bib->{'author'} eq '?' ) {
        my $author = parsespan( 'Author:', $fields{'Author:'}, $webpage );
        $author =~ s/\.$//;
        $bib->{'author'} = $author if $author;
    }

    if ( exists $fields{'Edition:'} and $bib->{'edition'} eq '?' ) {
        my $edition = parsespan( 'Edition:', $fields{'Edition:'}, $webpage );
        $edition =~ s/(.*?)\s.*$/$1/;
        $bib->{'edition'} = $edition if $edition;
    }

    if ( exists $fields{'ISBN:'} and $bib->{'ISBN'} eq '?' ) {
        my $isbn = parsespan( 'ISBN:', $fields{'ISBN:'}, $webpage );

        # Nine digits followed by a digit or X.  The former pattern
        # (\d{9}\d|X) matched "ten digits" or a lone "X".
        $bib->{'ISBN'} = $1 if $isbn =~ /(\d{9}[\dX])/;
    }

    if ( exists $fields{'Published:'} ) {
        my $published =
          parsespan( 'Published:', $fields{'Published:'}, $webpage );

        # "place : publisher, ... year".  Captures are only used when the
        # match succeeded, so values from an earlier match cannot leak in.
        if ( $published =~ /(.*?)\s:\s(.*?),.*?(\d{4}).*$/ ) {
            my ( $place, $publisher, $year ) = ( $1, $2, $3 );
            $bib->{'publisher'} = $publisher
              if $publisher and $bib->{'publisher'} eq '?';
            $bib->{'address'} = alltrim($place)
              if $place and $bib->{'address'} eq '?';
            $bib->{'year'} = $year if $year and $bib->{'year'} eq '?';
        }
    }
    return;
}

# parsespan($begin, $end, $s)
#
# Returns the text of $s between the literal strings $begin and $end, with
# newlines turned into spaces and runs of whitespace collapsed to a single
# space.  The span never contains another occurrence of $begin.  Returns the
# empty string when no such span exists.  Both delimiters are quoted because
# they come from page content and may contain regex metacharacters.
sub parsespan {
    my ( $begin, $end, $s ) = @_;
    return '' unless defined $s;

    my $begin_re = quotemeta $begin;
    my $end_re   = quotemeta $end;
    return '' unless $s =~ /${begin_re}((?:(?!${begin_re}).)*)${end_re}/ms;

    my $span = $1;
    $span =~ s/\n/ /g;
    $span =~ s/\s\s+/ /g;
    return $span;
}

# alltrim($s) -- returns $s without leading and trailing whitespace.
sub alltrim {
    my $s = shift;
    $s =~ s/^\s+//;
    $s =~ s/\s+$//;
    return $s;
}

# ean2isbn($ean)
#
# Converts a 13-digit EAN with the 978 prefix to an ISBN-10: the prefix and
# the EAN check digit are dropped and the ISBN check digit is recomputed.
# Whitespace in the input is ignored.  Anything else is reported on STDERR
# and yields an empty list, so push(@isbns, ean2isbn($x)) adds nothing.
sub ean2isbn {
    my $ean = shift;
    $ean = '' unless defined $ean;
    $ean =~ s/\s+//g;

    unless ( $ean =~ /^978\d{10}$/ ) {
        warn "Skipping '$ean': not a 13-digit EAN starting with 978.\n"
          if length $ean;
        return;
    }

    my $isbn = substr( $ean, 3, 10 );
    return substr( $isbn, 0, 9 ) . checkDigit($isbn);
}

# checkISBN($isbn)
#
# Validates a 10-character ISBN.  Returns a two-element list:
#   (0, '-')  fewer than 10 characters
#   (0, '+')  more than 10 characters
#   (0, '?')  not nine digits followed by a digit or X
#   ($ok, $cd) otherwise, where $cd is the computed check digit and $ok is
#              1 when it equals the last character (case-insensitively).
sub checkISBN {
    my $isbn = shift;
    $isbn = '' unless defined $isbn;

    my $n = length($isbn);
    return ( 0, ( $n < 10 ? '-' : '+' ) ) if $n != 10;
    return ( 0, '?' ) unless $isbn =~ /^\d{9}[\dXx]$/;

    my $cd = checkDigit($isbn);
    return ( ( $cd eq uc( substr( $isbn, -1, 1 ) ) ? 1 : 0 ), $cd );
}

# checkDigit($isbn)
#
# Computes the ISBN-10 check digit from all but the last character of
# $isbn: the digits are weighted 10, 9, 8, ... and the weighted sum modulo
# 11 selects the check character ('X' stands for ten).
sub checkDigit {
    my @digits = split( //, uc(shift) );
    my @check_for = qw(0 X 9 8 7 6 5 4 3 2 1);

    my $sum    = 0;
    my $weight = 10;
    for my $i ( 0 .. $#digits - 1 ) {
        $sum += $digits[$i] * $weight--;
    }
    return $check_for[ $sum % 11 ];
}

# clean($s)
#
# Returns $s with one trailing ',', ':', '\', '/' or '.' removed, trailing
# whitespace removed, and all characters in the range 0x7f-0xff deleted.
# An undefined argument is returned unchanged.
sub clean {
    my $s = shift;
    return $s unless defined $s;
    $s =~ s/[,:\\\/.]$//;
    $s =~ s/\s+$//;
    $s =~ s/[\x7f-\xff]//g;
    return $s;
}

=head1 NAME

ean2bib.pl -- Application to create BibTeX data from EANs.

=head1 SYNOPSIS

ean2bib.pl [options] filespec

Options:

    -debug      set debug level, default is off
    -help       brief help message
    -man        full documentation
    -version    version number
    -lookup     publisher address lookup file
    -isbn       ISBN entry versus EAN entry, default is EAN entry
    -output     name of file to write bib entries to, defaults to 'ean2bib.bib'
    -noappend   turn append mode for output ON or OFF, defaults to OFF (do append)
    -location   add 'location' tag to bib entry.
    -image      add 'image' tag to bib entry using Amazon's small image
    -nolist     suppress backup of ISBNs, defaults to OFF (do backup list)
    -authors    use Library of Congress lookup instead of Amazon for 'authors' field

Switches that don't define a value can be done in long or short form.
eg:

    ean2bib.pl --man
    ean2bib.pl -m

=head1 OPTIONS

=over 8

=item B<-debug>

Display debug information as program is executed. Control is set by
level of the value passed on the command line. Default value is off
(debug == 0). Setting this to something greater than zero will also
send the output to STDOUT overriding the normal defaults. Skips
appending etc.

=item B<-help>

Print a brief help message and exit.

=item B<-man>

Print the manual page (full documentation) and exit.

=item B<-version>

Print the version number and exit.

=item B<-lookup>

Specify a 'lookup' file for publisher addresses.

=item B<-isbn>

Flag to allow ISBN entry versus EAN entry. Default is EAN entry.

=item B<-output>

Name of file to write output to. Default is 'ean2bib.bib'.

=item B<-noappend>

Flag to turn off append mode for output file. Default is OFF (do
append).

=item B<-location>

Turn on 'location' tag for bib entry, using the value given for this
option.

=item B<-image>

Turn on 'image' tag of bib entry. Currently 'small' image is hardcoded
in.

=item B<-nolist>

Suppress backup of ISBNs, defaults to OFF (do backup list).

=item B<-authors>

Note AMAZON currently has a bug in that they think it is alright to
list 'Vic Broquard', 'Broquard Vic', and 'Victor E. Broquard' as
multiple authors! Using -authors forces a LOC lookup instead of
accepting the AMAZON lookup for those cases of multiple authorship.
Does not affect single authorship lookup.

=back

=head1 DESCRIPTION

This application uses the AMAZON web services API and the Library of
Congress MARC database to create a BibTeX entry based solely on the EAN
taken either from text files specified on the command line or entered by
way of a bar code wand at the command line.

=head2 EXAMPLE

Suppose you had a file of EANs, say eans.txt that contained:

    9780446611336
    9780451458711
    9780446610902
    9780596004361
    9780201185379
    9780201489460
    9780764545696
    9780138482763

One EAN per line. Then say you typed 'ean2bib eans.txt' at the command
line. The result would be the creation of a file called
'ean2bib.bib.isbn' and another called 'ean2bib.bib'. The '.bib' file
would be appended to if it already existed. On the first run however, it
would look like (shortened slightly for POD):

    @book{,
        author={Rebecca Neason}
        title={The Truest Power}
        publisher={Warner Books}
        address={?}
        year={2002}
        ISBN={0446611336}
    }

    @book{,
        author={Alan F. Troop}
        title={The Dragon Delasangre}
        publisher={Roc}
        address={New York}
        year={2002}
        ISBN={0451458710}
    }

    @book{,
        author={Katherine Kurtz}
        title={Crusade of Fire: Mystical Tales of the Knights Templar}
        publisher={Warner Books}
        address={?}
        year={2002}
        ISBN={0446610909}
    }

    @book{,
        author={Peter Prinz and Ulla Kirch-Prinz}
        title={C Pocket Reference}
        publisher={O'Reilly & Associates}
        address={?}
        year={2002}
        ISBN={0596004362}
    }

    . . .

    @book{,
        author={Vic Broquard and Broquard Vic and Victor E. Broquard}
        title={Intermediate Mfc}
        publisher={Prentice Hall PTR}
        address={Upper Saddle River, NJ}
        year={1998}
        ISBN={0138482764}
    }

In other words, the output is suitable as input into a LaTeX document as
a .bib file. Noticeable is the occasional 'address={?}' entry. This is
because of two things: first, Amazon doesn't supply publisher address
information. Second, while the Library of Congress is better, sometimes
it doesn't either! Because of this, the --lookup option was added. This
option supplies a text file to use as an address lookup table backstop.
As an example:

    A. H. Baily and Co.|London, UK
    Ace Books|New York, NY
    Ace|New York, NY
    ActiveState Tool Corp.|Vancouver, Canada, BC
    Addison Wesley|Boston, MA
    Addison Wesley|Reading, MA
    Addison-Wesley|Upper Saddle River, NJ
    Aeonian Press|Mattituck, NY
    Al. Kalmajs Prtg. Co.|Chicago, Illinois
    Aladdin|New York, NY
    Albert E. Woolum|N. Richland Hills, TX
    Alfred Kalnajs and Son|Gulfport, FL
    Alfreds Kalnajs|Chicago, Illinois
    American Chess Promotions|Macon, GA
    American Chess Promotion|Macon, GA
    American Chess Quarterly|Columbus, OH
    Andreyevski Flag|Himberg Austria
    Arbor House|New York, NY
    Aspect|New York, NY
    Atheneum|New York, NY
    Atria|New York, NY
    AvoNova|New York, NY
    Avon Books (Trd)|New York, NY
    Avon|New York, NY

Forms the first few lines of my 'address.txt' file. The format is
simple; publisher name followed by publisher address, delimited by '|'.

Also noticeable is the 'author={Vic Broquard and Broquard Vic and Victor
E. Broquard}' from the last EAN in the example. Amazon currently thinks
this is a good thing! I think it is a bug, but Amazon is bigger than I
am. At any rate, I've added the --authors option to fix this. Briefly it
skips the author information from Amazon in favor of the author
information from the Library of Congress lookup.

=head2 WHAT IS LATEX?

LaTeX is a high-quality typesetting system, with features designed for
the production of technical and scientific documentation. LaTeX is the
de facto standard for the communication and publication of scientific
documents.

=head2 WHAT IS BIBTEX?

From http://www.santafe.edu/~vince/MacBibTeX.html:

BibTeX is the bibliography handling tool related to the TeX/LaTeX
typesetting system (available on almost all conceivable OS platforms).
When a document is typeset with LaTeX, an auxiliary file (with extension
'.aux') is generated. This auxiliary file contains, amongst other
things, a list of references (to books, articles, research reports,
web-url's,...) cited in the original document. BibTeX takes this list,
together with the name of a style file (extension '.bst') and a list of
bibliography databases (extension '.bib') which are also given in the
auxiliary file, and produces a wonderfully formatted list of references
which the LaTeX system subsequently (and automatically) appends to your
typeset document.

For those who think this sounds a bit too complicated, the benefits in
terms of quality of output (both textual and mathematical) and automatic
generation of all manner of numbering schemes (figures, equations,
sections), and document parts (table of contents, index, bibliography),
together with cross-platform availability, mean that many people
consider it the _best_ current system for producing quality documents.

=head2 WHAT IS A BIB FILE?

From http://pax.st.usm.edu/~kolibal/tex_html/bib_html/bib.html:

There are several approaches to including a bibliography into a LaTeX
document. The easiest, but not the best is to construct a bbl file. The
bbl file, of FILE.bbl where file is the name of the LaTeX file simply
contains the command LaTeX interprets to write out the bibliography. The
difficulty with a bbl file is that it is rigid. Once constructed, in
order to change the formatting or ordering, manually rewriting the file
is the only option available.

Instead, the preferred mechanism for producing a bibliography is to
construct a bib file, i.e., a file, usually with the name FILE.bib. The
bib file is a database file, which can be processed by the utility
bibtex to produce a bbl file. The advantage is that the formatting of
the bbl file is then determined by a style template, consequently the
bibliography can easily be reformatted to meet the requirements of a
particular publication.

=head2 WHAT IS AN EAN?

EAN stands for European Article Number and in this context refers to the
Bookland EAN, a thirteen digit, strictly numeric bar code for the
publishing industry. It is derived from the ISBN number assigned to a
particular title.

=head1 AUTHOR

Hugh S. Myers

hsmyers@sdragons.com

=head1 BUGS

None that I know of.

=head1 TODO

    Mon May 26 10:51:13 2003 Add LCCN option for older books.
    Mon May 26 11:08:04 2003 Convert file entry and command line entry to use same code.
    Mon May 26 12:55:47 2003 Correct flaw in logic for checking EANs. DONE
    Mon May 26 13:50:10 2003 Add 'image' option for Amazon thumbnail image URLs. DONE
    Fri Jun 06 09:21:52 2003 Drop VBZOOMC.ZoomFactory in favor of a web scrape. DONE
    Fri Jun 06 09:23:00 2003 Add CS1504 option to set barcode type and location info.
    Fri Jun 06 09:45:21 2003 Begin adding debug features for testing. DONE
    Fri Jun 06 11:16:20 2003 Add 'authors' option as an Amazon bug work-around. DONE

=head1 UPDATES

    0.02 Removes dependency on ActiveX .dll. Improve documentation.

=cut

__DATA__
9780446611336
9780451458711
9780446610902
9780596004361
9780201185379
9780201489460
9780764545696
9780138482763