#!/perl/bin/perl
#
# ean2bib.pl -- Application converts EANs to BibTeX data.
use strict;
use warnings;
use diagnostics;
use Getopt::Long;
use Pod::Usage;
use LWP::Simple;
use XML::Simple;

# Release number printed by the --version switch.
our $VERSION = '0.02';

# Amazon developer token, read from the environment and interpolated into
# the web-service query URL.
my $devtok = $ENV{'AMAZON_DEVTOK'};

# One XML parser instance, reused for every Amazon response.
my $simple = XML::Simple->new;

# Working BibTeX record. Every field starts out as the placeholder '?'.
my %bib = map { ( $_ => '?' ) }
    qw(author title publisher address edition year ISBN image);

# $isbn_list : comma-joined batch of ISBNs sent to Amazon in one request
# %lookup    : publisher => address table loaded from the --lookup file
# @isbns     : ISBNs waiting to be looked up
# $isbn      : file-scoped scalar (the subs below declare their own $isbn)
my ( $isbn_list, %lookup, @isbns, $isbn );

# Parse the command line. Options that carry a value are declared with
# '=i' / '=s'; everything else is a plain on/off flag. Each option gets its
# own file-scoped lexical, which the rest of the script reads directly.
GetOptions(
    'debug=i'    => \( my $debug = 0 ),
    'help|?'     => \( my $opt_help ),
    'man'        => \( my $opt_man ),
    'version'    => \( my $opt_version ),
    'lookup=s'   => \( my $opt_lookup ),
    'isbn'       => \( my $opt_isbn ),

    # --output names a file, so it must take a string argument. Declared as
    # a bare flag it was simply set to 1 and the records were written to a
    # file literally called "1".
    'output=s'   => \( my $opt_output ),
    'noappend'   => \( my $opt_noappend ),
    'location=s' => \( my $opt_location ),
    'image'      => \( my $opt_image ),
    'nolist'     => \( my $opt_nolist ),
    'authors'    => \( my $opt_authors ),
) or pod2usage(2);

# Informational switches: print what was asked for and stop.
if ($opt_version) {
    print "ean2bib.pl vrs. $VERSION\n";
    exit;
}
pod2usage(1) if $opt_help;
pod2usage( -verbose => 2 ) if $opt_man;
# Decide where the BibTeX records are written. Any non-zero --debug level
# aliases OUTPUT to STDOUT. Otherwise OUTPUT is opened on the --output file
# (default 'ean2bib.bib'), appending unless --noappend was given.
if ($debug) {
    *OUTPUT = *STDOUT;
}
else {
    my $bib_file  = $opt_output   ? $opt_output : 'ean2bib.bib';
    my $open_mode = $opt_noappend ? '>'         : '>>';

    # Three-argument open keeps the mode apart from the file name, so a name
    # that starts with '>', '|' or a blank cannot change how it is opened.
    open( OUTPUT, $open_mode, $bib_file )
        or die "Can't open $bib_file: $!\n";
}
# Load the optional publisher => address table named by --lookup. Each line
# holds "publisher|address"; ParseAmazon consults the table when no address
# was found for a book.
if ($opt_lookup) {
    open( my $address_fh, '<', $opt_lookup )
        or die "Can't open $opt_lookup: $!\n";
    while ( my $entry = <$address_fh> ) {
        chomp $entry;
        $entry =~ s/\r\z//;    # tolerate files with CRLF line endings

        my ( $publisher, $address ) = split( /\|/, $entry );

        # Blank lines and lines without a '|' would otherwise store an
        # undefined key or value in the table; skip them.
        next unless defined $publisher and length $publisher;
        next unless defined $address   and length $address;

        $lookup{$publisher} = $address;
    }
    close($address_fh);
}
# Fill @isbns from one of three sources:
#   * --debug         : the sample EANs stored after __DATA__
#   * no file args    : lines typed (or wanded) on standard input
#   * file arguments  : every file matched by the given file specs (filter)
# Surrounding whitespace (including a stray CR) is removed from each line and
# empty lines are skipped, so they are never turned into bogus ISBNs.
if ($debug) {
    while ( my $ean = <DATA> ) {
        $ean =~ s/^\s+//;
        $ean =~ s/\s+$//;
        next unless length $ean;
        push( @isbns, ean2isbn($ean) );
    }
}
elsif ( @ARGV == 0 ) {
    if ($opt_isbn) {
        print "Enter ISBN+newline---^Z to end. ^C to cancel.\n";
        while ( my $entry = <> ) {
            $entry =~ s/^\s+//;
            $entry =~ s/\s+$//;
            next unless length $entry;

            # checkISBN returns (ok, detail); on failure detail is '-' for
            # too short, '+' for too long, otherwise the expected check digit.
            my ( $ok, $checkdigit ) = checkISBN($entry);
            if ($ok) {
                push( @isbns, $entry );
            }
            elsif ( $checkdigit eq '-' ) {
                print "Too few characters in ISBN, 10 required.\n";
            }
            elsif ( $checkdigit eq '+' ) {
                print "Too many characters in ISBN, 10 required.\n";
            }
            else {
                print "Bad check digit in ISBN, one or more characters wrong.\n";
            }
        }
    }
    else {
        print "Enter EAN+newline---^Z to end. ^C to cancel\n";
        while ( my $ean = <> ) {
            $ean =~ s/^\s+//;
            $ean =~ s/\s+$//;
            next unless length $ean;
            push( @isbns, ean2isbn($ean) );
        }
    }
}
else {
    # Expand wildcards ourselves, for shells that do not do it.
    for my $file ( map { glob($_) } @ARGV ) {
        filter($file);
    }
}
# Unless --nolist was given, write the collected ISBNs, one per line, to a
# backup file named after the output file with '.isbn' appended.
unless ($opt_nolist) {
    my $list_file = ( $opt_output ? $opt_output : 'ean2bib.bib' ) . '.isbn';

    # Three-argument open with a lexical handle: the file name can no longer
    # be mistaken for an open mode, and the handle cannot clash with others.
    open( my $list_fh, '>', $list_file )
        or die "Couldn't open $list_file: $!\n";
    for my $entry (@isbns) {
        print {$list_fh} $entry, "\n";
    }

    # Buffered write errors (e.g. a full disk) only show up at close time.
    close($list_fh) or die "Couldn't write $list_file: $!\n";
}
# Look the ISBNs up at Amazon in batches of at most 30 and write one @book
# record per <Details> element returned.
#
# NOTE(review): the records are written without commas between the fields,
# exactly as shown in the manual's example. BibTeX normally expects
# comma-separated fields - confirm before changing the format.
if ( @isbns and not defined $devtok ) {
    print STDERR
        "AMAZON_DEVTOK is not set; the Amazon lookups will probably fail.\n";
}
my $dev_token = defined $devtok ? $devtok : '';

while (@isbns) {
    $isbn_list = join( ",", splice( @isbns, 0, 30 ) );
    my $Amazon_webpage = get("http://xml.amazon.com/onca/xml2?t=webservices-20&dev-t=$dev_token&AsinSearch=$isbn_list&type=lite&f=xml");
    if ($Amazon_webpage) {

        # XMLin dies on malformed XML; do not let one bad response abort the
        # remaining batches.
        my $root = eval { $simple->XMLin($Amazon_webpage) };
        unless ( ref($root) eq 'HASH' ) {
            print STDERR "Could not parse the Amazon response for $isbn_list\n";
            next;
        }

        # XML::Simple folds a lone <Details> element into a hash reference
        # and several into an array reference; accept both shapes.
        my $details = $root->{'Details'};
        my @records =
              ref($details) eq 'ARRAY' ? @$details
            : ref($details) eq 'HASH'  ? ($details)
            :                            ();

        for my $record (@records) {

            # Start every book from a blank record.
            $bib{$_} = '?' for keys %bib;
            ParseAmazon( $record, \%bib );

            print OUTPUT "\@book{,\n";
            for my $field (qw(author title publisher address year ISBN)) {
                my $value = defined $bib{$field} ? $bib{$field} : '?';
                print OUTPUT '  ', $field, '={', $value, "}\n";
            }
            if ($opt_location) {
                print OUTPUT '  location={', $opt_location, "}\n";
            }
            if ($opt_image) {
                my $image = defined $bib{'image'} ? $bib{'image'} : '?';
                print OUTPUT '  image={', $image, "}\n";
            }
            print OUTPUT "}\n\n";
        }
    }
    else {
        # Report on STDERR: in debug mode STDOUT carries the BibTeX output.
        print STDERR "Bad luck this time\n";
    }
}
close(OUTPUT) or print STDERR "Error while closing the output: $!\n";

# filter($file)
#
# Read $file line by line and append the ISBNs it yields to @isbns. With
# --isbn each line is taken as an ISBN and validated with checkISBN (problems
# are reported on STDERR and the line is dropped); otherwise each line is
# taken as an EAN and converted with ean2isbn. Surrounding whitespace is
# removed and empty lines are skipped. Dies if the file cannot be opened.
sub filter {
    my $file = shift;

    # Three-argument open: the file name comes from the command line and
    # must not be interpreted as an open mode or a pipe.
    open( my $in, '<', $file ) or die "Couldn't open $file: $!\n";
    print STDERR "$file:\n";

    while ( my $line = <$in> ) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;    # also removes the newline and any stray CR
        next unless length $line;

        unless ($opt_isbn) {
            push( @isbns, ean2isbn($line) );
            next;
        }

        # checkISBN returns (ok, detail); on failure detail is '-' for too
        # short, '+' for too long, otherwise the expected check digit.
        my ( $ok, $checkdigit ) = checkISBN($line);
        if ($ok) {
            push( @isbns, $line );
        }
        elsif ( $checkdigit eq '-' ) {
            print STDERR "Too few characters in ISBN '$line', 10 required.\n";
        }
        elsif ( $checkdigit eq '+' ) {
            print STDERR "Too many characters in ISBN '$line', 10 required.\n";
        }
        else {
            print STDERR "Bad check digit in ISBN '$line', one or more characters wrong.\n";
        }
    }
    close($in);
    return;
}

# ParseAmazon($detail, $bib)
#
# Fill the BibTeX record $bib (hash ref) from one Amazon <Details> element
# ($detail, hash ref as produced by XML::Simple), then top it up from the
# Library of Congress via scrapeLOC and, for the address, from the --lookup
# table. A field Amazon does not supply keeps its '?' placeholder instead of
# being overwritten with an undefined value.
#
# With --authors, a multi-author list from Amazon is only used when the
# Library of Congress did not provide an author.
sub ParseAmazon {
    my ( $detail, $bib ) = @_;

    # Plain text fields: BibTeX field => Amazon element.
    my %amazon_key_for = (
        title     => 'ProductName',
        publisher => 'Manufacturer',
        ISBN      => 'Asin',
    );
    for my $field ( sort keys %amazon_key_for ) {
        my $raw = $detail->{ $amazon_key_for{$field} };
        next if !defined $raw or ref $raw;
        my $value = clean($raw);
        $bib->{$field} = $value if defined $value and length $value;
    }

    my $image = $detail->{'ImageUrlSmall'};
    $bib->{'image'} = $image if defined $image and !ref $image;

    # Keep only the trailing number of the release date (the year).
    my $release = $detail->{'ReleaseDate'};
    if ( defined $release and !ref $release ) {
        ( my $year = $release ) =~ s/.*? (\d+)$/$1/;
        $bib->{'year'} = $year if length $year;
    }

    # <Author> is a scalar for one author and an array ref for several.
    my $author_node = $detail->{'Authors'};
    my $authors =
        ref($author_node) eq 'HASH' ? $author_node->{'Author'} : undef;
    my @list;
    if ( ref($authors) eq 'ARRAY' ) {
        @list = grep { defined($_) and not ref($_) } @$authors;
        if ( @list and not $opt_authors ) {
            $bib->{'author'} = clean( join( " and ", @list ) );
        }
    }
    elsif ( defined $authors and !ref $authors ) {
        my $single = clean($authors);
        $bib->{'author'} = $single if length $single;
    }

    # Ask the Library of Congress for whatever is still missing. Without an
    # ISBN there is nothing to search for.
    if ( $bib->{'ISBN'} ne '?' ) {
        my $webpage = get("http://lcweb.loc.gov/cgi-bin/zclient?host=z3950.loc.gov&port=7090&attrset=BIB1&rtype=USMARC&DisplayRecordSyntax=HTML&ESN=F&startrec=1&maxrecords=10&dbname=Voyager&srchtype=1,7,2,3,3,1,4,1,5,1,6,1&term_term_1=$bib->{'ISBN'}");
        scrapeLOC( $webpage, $bib );
    }

    # Last resort for the address: the user's publisher table. Use the record
    # that was passed in, not the file-level %bib.
    if ( $bib->{'address'} eq '?' and $opt_lookup ) {
        my $publisher = $bib->{'publisher'};
        if ( defined $publisher and defined $lookup{$publisher} ) {
            $bib->{'address'} = $lookup{$publisher};
        }
    }

    # --authors fallback: nothing better was found, so use Amazon's list.
    if ( $bib->{'author'} eq '?' and scalar(@list) ) {
        $bib->{'author'} = clean( join( " and ", @list ) );
    }
    return;
}

# GetSubfield($subfield, $key)
#
# Return the cleaned 'content' of a subfield. When $subfield is an array ref
# of subfield hashes, the first one whose 'code' equals $key is used and
# nothing (undef / empty list) is returned if none matches. Any other
# $subfield is treated as a single subfield hash and its content is returned
# without looking at its code.
# NOTE(review): the single-hash case ignores $key - confirm that is intended.
sub GetSubfield {
    my ( $subfield, $key ) = @_;

    if ( ref($subfield) eq 'ARRAY' ) {
        for my $entry (@$subfield) {
            return clean( $entry->{'content'} ) if $entry->{'code'} eq $key;
        }

        # No match: return explicitly rather than whatever the loop left.
        return;
    }
    return clean( $subfield->{'content'} );
}

# scrapeLOC($webpage, $bib)
#
# Fill the fields of the BibTeX record $bib (hash ref) that are still unset
# ('?' or undefined) from a Library of Congress result page. Does nothing
# when the page is empty or reports "0 records".
#
# The page is treated as "Label: text" lines: everything up to the first
# colon of a line is collected as a label, and the text of a field is what
# lies between its label and the label that follows it (the last field ends
# at </PRE>).
sub scrapeLOC {
    my ( $webpage, $bib ) = @_;
    return unless $webpage and $webpage !~ /<b>0<\/b> records/;

    my @labels;
    while ( $webpage =~ /^(.*?:)/mg ) {
        push( @labels, $1 );
    }
    return unless @labels;

    # Map each label to the delimiter that ends its text. parsespan uses the
    # delimiter as a regular expression, so scraped labels are escaped.
    my %fields;
    for my $i ( 0 .. $#labels - 1 ) {
        $fields{ $labels[$i] } = quotemeta $labels[ $i + 1 ];
    }
    $fields{ $labels[-1] } = '<\/PRE>';

    # True when the record has no real value for a field yet.
    my $unset = sub {
        my $field = shift;
        return ( !defined $bib->{$field} or $bib->{$field} eq '?' );
    };

    # Trimmed text of a field, or '' when it cannot be extracted.
    my $span_of = sub {
        my $label = shift;
        my $span = parsespan( $label, $fields{$label}, $webpage );
        return defined $span ? alltrim($span) : '';
    };

    if ( exists $fields{'Title:'} ) {

        # "Title / statement of responsibility"
        my ( $title, $author ) =
            map { alltrim($_) } split( /\//, $span_of->('Title:') );
        if ( defined $author ) {
            $author =~ s/,/ and/g;
            $author =~ s/\.$//;
        }
        $bib->{'title'}  = $title  if $title  and $unset->('title');
        $bib->{'author'} = $author if $author and $unset->('author');
    }
    if ( exists $fields{'Author:'} and $unset->('author') ) {
        my $author = $span_of->('Author:');
        $author =~ s/\.$//;
        $bib->{'author'} = $author if $author;
    }
    if ( exists $fields{'Edition:'} and $unset->('edition') ) {

        # Keep the first word only, e.g. "2nd" from "2nd ed.".
        my $edition = $span_of->('Edition:');
        $edition =~ s/(.*?)\s.*$/$1/;
        $bib->{'edition'} = $edition if $edition;
    }
    if ( exists $fields{'ISBN:'} and $unset->('ISBN') ) {

        # Nine digits followed by a digit or X. The former pattern
        # \d{9}\d|X matched "ten digits, or a lone X".
        if ( $span_of->('ISBN:') =~ /\b(\d{9}[\dXx])\b/ ) {
            $bib->{'ISBN'} = uc $1;
        }
    }
    if ( exists $fields{'Published:'} ) {

        # "Place : Publisher, year". Only use the captures of a successful
        # match; after a failed one $1..$3 still hold older values.
        if ( $span_of->('Published:') =~ /(.*?)\s:\s(.*?),.*?(\d{4}).*$/ ) {
            my ( $place, $publisher, $year ) = ( $1, $2, $3 );
            $bib->{'publisher'} = $publisher
                if $publisher and $unset->('publisher');
            $bib->{'address'} = alltrim($place)
                if $place and $unset->('address');
            $bib->{'year'} = $year if $year and $unset->('year');
        }
    }
    return;
}

# parsespan($begin, $end, $s)
#
# Return the text of $s that lies between $begin and $end, with newlines
# turned into blanks and runs of whitespace collapsed to a single blank.
# $begin and $end are used as regular expressions. The span never crosses a
# second occurrence of $begin.
#
# Returns the empty string when $s is undefined, when nothing matches, or
# when $begin/$end do not form a valid pattern. Previously a failed match
# returned whatever an earlier match had left in $1.
sub parsespan {
    my ( $begin, $end, $s ) = @_;
    return '' unless defined $s;

    my $span;
    my $ok = eval {
        # In list context the match returns its captures, so $span is only
        # set by this very match.
        ($span) = $s =~ /$begin((?:(?!$begin).)*)$end/ms;
        1;
    };
    return '' unless $ok and defined $span;

    $span =~ s/\n/ /g;
    $span =~ s/\s\s+/ /g;
    return $span;
}

# alltrim($text)
#
# Return a copy of $text with leading and trailing whitespace removed.
# Whitespace inside the text is left alone.
sub alltrim {
    my ($text) = @_;
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}

# ean2isbn($ean)
#
# Convert a 13-digit Bookland EAN into the corresponding 10-character ISBN:
# drop the '978' prefix and the EAN check digit, then append the ISBN check
# digit computed by checkDigit. Blanks and hyphens in the input are ignored.
#
# Only EANs with the '978' prefix have an ISBN-10 equivalent. For anything
# else (wrong length, non-digits, another prefix) a message is written to
# STDERR and nothing is returned, so that push(@list, ean2isbn($x)) adds no
# entry. Previously such input produced a bogus ISBN such as "0".
sub ean2isbn {
    my $ean = shift;
    return unless defined $ean;

    $ean =~ tr/- \t\r\n//d;
    unless ( $ean =~ /\A978\d{10}\z/ ) {
        print STDERR "Skipping '$ean': not a 13-digit EAN starting with 978.\n"
            if length $ean;
        return;
    }

    my $isbn = substr( $ean, 3, 10 );
    return substr( $isbn, 0, 9 ) . checkDigit($isbn);
}

# checkISBN($isbn)
#
# Validate a 10-character ISBN. Returns a two-element list (ok, detail):
#   (1, digit)  the check digit is correct
#   (0, '-')    fewer than 10 characters
#   (0, '+')    more than 10 characters
#   (0, digit)  wrong check digit; 'digit' is the one that was expected
# The check digit is compared case-insensitively, so a trailing 'x' is
# accepted just like 'X' (checkDigit always reports 'X').
sub checkISBN {
    my $isbn = shift;
    $isbn = '' unless defined $isbn;
    my $n = length($isbn);

    if ( $n != 10 ) {
        return ( 0, ( $n < 10 ? '-' : '+' ) );
    }

    my $cd = checkDigit($isbn);
    return ( ( $cd eq uc( substr( $isbn, -1, 1 ) ) ) ? 1 : 0, $cd );
}

# checkDigit($isbn)
#
# Compute the ISBN-10 check character for $isbn. The last character of the
# argument (the slot of the check digit itself) is ignored; the characters
# before it are weighted 10, 9, 8, ... and summed, and the sum modulo 11
# selects the check character ('0'-'9' or 'X').
sub checkDigit {
    my ($candidate) = @_;

    my @chars = split( //, uc($candidate) );
    pop @chars;    # drop the position that holds the check digit

    my $weight = 10;
    my $total  = 0;
    foreach my $char (@chars) {
        $total += $char * $weight;
        $weight--;
    }

    # Indexed by the weighted sum modulo 11.
    my @check_for = qw(0 X 9 8 7 6 5 4 3 2 1);
    return $check_for[ $total % 11 ];
}

# clean($s)
#
# Tidy a text value for use in a BibTeX field: drop all characters in the
# range \x7f-\xff, remove trailing whitespace, and strip one trailing
# punctuation character (one of , : \ / .). Returns the cleaned string; an
# undefined argument is returned unchanged, without warnings.
#
# Whitespace is trimmed before the punctuation is stripped, so "Title. "
# and "Title /" both become "Title". Previously trailing whitespace
# protected the punctuation in front of it.
sub clean {
    my $s = shift;
    return $s unless defined $s;

    $s =~ s/[\x7f-\xff]//g;
    $s =~ s/\s+$//;
    $s =~ s/[,:\\\/.]$//;
    $s =~ s/\s+$//;    # whitespace that stood in front of the punctuation
    return $s;
}

=head1 NAME

 ean2bib.pl -- Application to create BibTeX data from EANs.

=head1 SYNOPSIS

ean2bib.pl [options] filespec

 Options:

  -debug     set debug level, default is off
  -help      brief help message
  -man       full documentation
  -version   version number
  -lookup    publisher address lookup file
  -isbn      ISBN entry versus EAN entry, default is EAN entry
  -output    name of file to write bib entries to, defaults to 'ean2bib.bib'
  -noappend  turn append mode for output ON or OFF, defaults to OFF (do append)
  -location  add 'location' tag to bib entry.
  -image     add 'image' tag to bib entry using Amazon's small image
  -nolist    suppress backup of ISBNs, defaults to OFF (do backup list)
  -authors   use Library of Congress lookup instead of Amazon for 'authors' field

 Switches that don't define a value can be done in long or short form.
 eg:
   ean2bib.pl --man
   ean2bib.pl -m

=head1 OPTIONS

=over 8

=item B<-debug>

Display debug information as program is executed. Control is set by level of the value
passed on the command line. Default value is off (debug == 0). Setting this to something
greater than zero will also send the output to STDOUT overriding the normal defaults.
Skips appending etc.

=item B<-help>

Print a brief help message and exit.

=item B<-man>

Print the manual page (full documentation) and exit.

=item B<-version>

Print the version number and exit.

=item B<-lookup>

Specify a 'lookup' file for publisher addresses.

=item B<-isbn>

Flag to allow ISBN entry versus EAN entry. Default is EAN entry.

=item B<-output>

Name of file to write output to. Default is 'ean2bib.bib'.

=item B<-noappend>

Flag to turn off append mode for output file. Default is OFF (do append).

=item B<-location>

Turn on 'location' tag for bib entry.

=item B<-image>

Turn on 'image' tag of bib entry. Currently 'small' image is hardcoded in.

=item B<-nolist>

Suppress backup of ISBNs, defaults to OFF (do backup list).

=item B<-authors>

Note AMAZON currently has a bug in that they think it is alright to list
'Vic Broquard', 'Broquard Vic', and 'Victor E. Broquard' as multiple authors!
Using -authors forces a LOC lookup instead of accepting the AMAZON lookup for
those cases of multiple authorship. Does not affect single authorship lookup.

=back

=head1 DESCRIPTION

This application uses the AMAZON web services API and the Library of Congress
MARC database to create a BibTeX entry based solely on the EAN taken either from
text files specified on the command line or entered by way of a bar code wand at
the command line.

=head2 EXAMPLE

Suppose you had a file of EANs, say eans.txt that contained:

 9780446611336
 9780451458711
 9780446610902
 9780596004361
 9780201185379
 9780201489460
 9780764545696
 9780138482763

One EAN per line. Then say you typed 'ean2bib eans.txt' at the command line. The
result would be the creation of a file called 'ean2bib.bib.isbn' and another
called 'ean2bib.bib'. The '.bib' file would be appended to if it already existed. On the
first run, however, it would look like this (shortened slightly for POD):

    @book{,
      author={Rebecca Neason}
      title={The Truest Power}
      publisher={Warner Books}
      address={?}
      year={2002}
      ISBN={0446611336}
    }

    @book{,
      author={Alan F. Troop}
      title={The Dragon Delasangre}
      publisher={Roc}
      address={New York}
      year={2002}
      ISBN={0451458710}
    }

    @book{,
      author={Katherine Kurtz}
      title={Crusade of Fire: Mystical Tales of the Knights Templar}
      publisher={Warner Books}
      address={?}
      year={2002}
      ISBN={0446610909}
    }

    @book{,
      author={Peter Prinz and Ulla Kirch-Prinz}
      title={C Pocket Reference}
      publisher={O'Reilly & Associates}
      address={?}
      year={2002}
      ISBN={0596004362}
    }

    .
    .
    .

    @book{,
      author={Vic Broquard and Broquard Vic and Victor E. Broquard}
      title={Intermediate Mfc}
      publisher={Prentice Hall PTR}
      address={Upper Saddle River, NJ}
      year={1998}
      ISBN={0138482764}
    }

In other words, the output is suitable as input into a LaTeX document as a .bib file.

Noticeable is the occasional 'address={?}' entry. This is because of two things; first,
Amazon doesn't supply publisher address information. Second, while the library of Congress
is better, sometimes it doesn't either! Because of this, the --lookup option was added.
This option supplies a text file to use as an address lookup table backstop. As an
example:

    A. H. Baily and Co.|London, UK
    Ace Books|New York, NY
    Ace|New York, NY
    ActiveState Tool Corp.|Vancouver, Canada, BC
    Addison Wesley|Boston, MA
    Addison Wesley|Reading, MA
    Addison-Wesley|Upper Saddle River, NJ
    Aeonian Press|Mattituck, NY
    Al. Kalmajs Prtg. Co.|Chicago, Illinois
    Aladdin|New York, NY
    Albert E. Woolum|N. Richland Hills, TX
    Alfred Kalnajs and Son|Gulfport, FL
    Alfreds Kalnajs|Chicago, Illinois
    American Chess Promotions|Macon, GA
    American Chess Promotion|Macon, GA
    American Chess Quarterly|Columbus, OH
    Andreyevski Flag|Himberg Austria
    Arbor House|New York, NY
    Aspect|New York, NY
    Atheneum|New York, NY
    Atria|New York, NY
    AvoNova|New York, NY
    Avon Books (Trd)|New York, NY
    Avon|New York, NY

Forms the first few lines of my 'address.txt' file. The format is simple; publisher
name followed by publisher address, delimited by '|'.

Also noticeable is the 'author={Vic Broquard and Broquard Vic and Victor E. Broquard}' from
the last EAN in the example. Amazon currently thinks this is a good thing! I think
it is a bug, but Amazon is bigger than I am. At any rate, I've added the --authors option
to fix this. Briefly it skips the author information from Amazon in favor of the author
information from the Library of Congress lookup.

=head2 WHAT IS LATEX?

LaTeX is a high-quality typesetting system, with features designed for the 
production of technical and scientific documentation. LaTeX is the de facto 
standard for the communication and publication of scientific documents. 

=head2 WHAT IS BIBTEX?

From http://www.santafe.edu/~vince/MacBibTeX.html:

BibTeX is the bibliography handling tool related to the TeX/LaTeX typesetting 
system (available on almost all conceivable OS platforms). When a document is 
typeset with LaTeX, an auxiliary file (with extension '.aux') is generated. 
This auxiliary file contains, amongst other things, a list of references (to 
books, articles, research reports, web-url's,...) cited in the original document. 
BibTeX takes this list, together with the name of a style file (extension '.bst') 
and a list of bibliography databases (extension '.bib') which are also given 
in the auxiliary file, and produces a wonderfully formatted list of references 
which the LaTeX system subsequently (and automatically) appends to your typeset 
document. For those who think this sounds a bit too complicated, the benefits in 
terms of quality of output (both textual and mathematical) and automatic generation
of all manner of numbering schemes (figures, equations, sections), and document 
parts (table of contents, index, bibliography), together with cross-platform 
availability, mean that many people consider it the _best_ current system for 
producing quality documents.

=head2 WHAT IS A BIB FILE?

From http://pax.st.usm.edu/~kolibal/tex_html/bib_html/bib.html:

There are several approaches to including a bibliography into a LaTeX document. The
easiest, but not the best is to construct a bbl file. The bbl file, of FILE.bbl
where file is the name of the LaTeX file simply contains the command LaTeX 
interprets to write out the bibliography. The difficulty with a bbl file is 
that it is rigid. Once constructed, in order to change the formatting or ordering, 
manually rewriting the file is the only option available. 

Instead, the preferred mechanism for producing a bibliography is to construct a bib
file, i.e., a file, usually with the name FILE.bib. The bib file is a database
file, which can be processed by the utility bibtex to produce a bbl file. The 
advantage is that the formatting of the bbl file is then determined by a style 
template, consequently the bibliography can easily be reformatted to meet the
requirements of a particular publication. 

=head2 WHAT IS AN EAN?

EAN stands for European Article Number and in this
context refers to the Bookland EAN,  a thirteen digit, strictly numeric bar code for
the publishing industry. It is derived from the ISBN number assigned to a particular
title. 

=head1 AUTHOR

 Hugh S. Myers
 hsmyers@sdragons.com

=head1 BUGS

None that I know of.

=head1 TODO

 Mon May 26 10:51:13 2003 Add LCCN option for older books.
 Mon May 26 11:08:04 2003 Convert file entry and command line entry to use same code.
 Mon May 26 12:55:47 2003 Correct flaw in logic for checking EANs. DONE
 Mon May 26 13:50:10 2003 Add 'image' option for Amazon thumbnail image URLs. DONE
 Fri Jun 06 09:21:52 2003 Drop VBZOOMC.ZoomFactory in favor of a web scrape. DONE
 Fri Jun 06 09:23:00 2003 Add CS1504 option to set barcode type and location info.
 Fri Jun 06 09:45:21 2003 Begin adding debug features for testing. DONE
 Fri Jun 06 11:16:20 2003 Add 'authors' option as an Amazon bug work-around. DONE

=head1 UPDATES

 0.02 Removes dependency on ActiveX .dll. Improve documentation.

=cut

__DATA__
9780446611336
9780451458711
9780446610902
9780596004361
9780201185379
9780201489460
9780764545696
9780138482763