Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot
 
PerlMonks  

CiteULike tools

by eweaverp (Scribe)
on Dec 08, 2005 at 08:46 UTC ( #515163=sourcecode: print w/replies, xml ) Need Help??
Category: Web Stuff
Author/Contact Info
Description:

bib_canonicalizer.pl - takes a citeulike url or a .bib file and searches it against the Collection of Computer Science Bibliographies, outputting a canonicalized file with all available fields filled. It does not clobber your citeulike URLs or tags.

citeulike_pdf_grabber.pl - downloads and caches all .pdfs it can find from your citeulike account (including private) and spools them to a printer. Will not print duplicates even over multiple runs as long as you don't delete the cache folder. It's good for printing only the things you have most recently added. Outputs a "missing.html" file with links to the citeulike articles it could not find a .pdf for. You will probably have to customize some of the regexes for the databases you use the most.

################################### bib_canonicalizer.pl
#!/alruccabah/local/usr/bin/perl

use strict;
use warnings;
#use diagnostics;

# --- Configuration and command line ---------------------------------------

# CiteULike account name. An input argument that starts with this name is
# treated as a CiteULike path to download instead of a local .bib file.
my $username = "whatever";

# Host of the bibliography search service and the search endpoint on it.
my $base = "http://liinwww.ira.uka.de";
my $url = "$base/csbib";

# Prefix of the CiteULike BibTeX export; the input path is appended to it.
my $citeulike = "http://www.citeulike.org/bibtex/user/";

# Usage: <citeulike path | input .bib file> [output .bib file]
my $input = $ARGV[0];
my $output = $ARGV[1];

# The trailing "\n" keeps die() from appending the script name and line.
die("Specify your citeulike username/tag path or an input .bib file.\n")
    unless ($input);

use Text::BibTeX qw(:macrosubs);
use LWP;
use HTML::Strip;
use URI::Escape;
use String::Compare;

# HTML-to-text stripper and HTTP client shared by every lookup.
my $stripper = HTML::Strip->new();
my $agent = LWP::UserAgent->new();

# Three-letter month abbreviations and the full names they stand for.
my %month = (
    jan => "January",
    feb => "February",
    mar => "March",
    apr => "April",
    may => "May",
    jun => "June",
    jul => "July",
    aug => "August",
    sep => "September",
    oct => "October",
    nov => "November",
    dec => "December",
);

# Register each abbreviation as a BibTeX macro expanding to the full name.
while (my ($abbrev, $full_name) = each %month) {
    add_macro_text($abbrev, $full_name);
}

#my $Input = new Text::BibTeX::File($input);

# Raw text of every BibTeX entry to process. Each element has had its
# leading "@" removed by the split below; the main loop puts it back.
my @Input;

{
    my $bibtex;

    # \Q...\E: the account name is literal text, not a pattern.
    if ($input =~ m/^\Q$username\E/) {
        print "Trying to retrieve from $citeulike$input.\n";

        # LWP::UserAgent::get() treats extra arguments as request *header*
        # fields, so the export options have to travel in the query string.
        # NOTE(review): option names are taken from the original call;
        # confirm that the export endpoint reads them from the query string.
        my $response = $agent->get(
            $citeulike . $input . "?do_username_prefix=0&key_type=0");
        die("Could not retrieve $citeulike$input: "
            . $response->status_line . "\n")
            unless ($response->is_success);
        $bibtex = $response->content;

        # Keep a copy of the downloaded export next to the script.
        open(my $tmp, '>', "tmp.bib") or die("Cannot write tmp.bib: $!\n");
        print $tmp $bibtex;
        close($tmp) or die("Cannot close tmp.bib: $!\n");

        if (!$output) {
            # Default output name: last path component of the input, so
            # "user/tag/foo" gives "foo.bib" and plain "user" gives
            # "user.bib" (previously the latter produced ".bib").
            my ($leaf) = $input =~ m{([^/]+)/*$};
            $output = (defined($leaf) ? $leaf : $username) . ".bib";
        }
    } else {
        die("Specify an output file when reading from a local .bib file.\n")
            unless ($output);

        open(my $in, '<', $input) or die("Cannot read $input: $!\n");
        $bibtex = do { local $/ = undef; <$in> };
        close($in);
    }

    $bibtex = "" unless (defined($bibtex));

    # Entries are separated at every "@" that starts a line. The extra
    # leading newline makes an "@" at the very start of the text count as
    # a separator too, so the first entry is not discarded by the shift.
    @Input = split(/\n\@/, "\n" . $bibtex);
}

# Drop whatever preceded the first entry (empty text or a preamble).
shift(@Input);


# Start from an empty output file. Every finished entry is appended (and
# the file closed again) right away, so entries already written survive an
# interrupted run.
open(my $fresh, '>', $output) or die("Cannot write $output: $!\n");
close($fresh) or die("Cannot close $output: $!\n");

#my $Output = new Text::BibTeX::File(">$output");

my ($failed, $total) = (0, 0);

foreach my $entry_text (@Input) {
    # The split that produced @Input removed the leading "@"; restore it.
    my $entry = Text::BibTeX::Entry->new("@" . $entry_text);
    next unless $entry->parse_ok;

    # Search key built from the authors' last names, space separated.
    my $authors = "";
    if ($entry->exists("author")) {
        foreach my $name ($entry->names("author")) {
            $authors .= " " . join(" ", $name->part("last"));
        }
    }

    my ($title) = $entry->get("title");
    $title = "" unless (defined($title));
    my ($year) = $entry->get("year");

    # Up to three lookups with different search terms; the first candidate
    # whose title is similar enough to ours wins.
    my @strategies = (
        [ $title,   $authors ],
        [ $title,   $year ],
        [ $authors, $year ],
    );
    # Without a title there is nothing to validate a candidate against.
    @strategies = () unless (length($title));

    my $new_entry;
    my $matched = 0;
    my $pass = 0;

    foreach my $terms (@strategies) {
        my $found = lookup_single(@$terms);
        $pass++;
        print "  Pass $pass.\n";

        next unless ($found);

        my $candidate = Text::BibTeX::Entry->new($found);
        next unless ($candidate->parse_ok);

        # delete() takes field names, not an array reference; the former
        # delete(["crossref"]) therefore never removed anything.
        $candidate->delete("crossref");

        my $new_title = $candidate->get("title");
        next unless (defined($new_title));

        if (compare($title, $new_title) > .5) {
            $new_entry = $candidate;
            $matched = 1;
            last;
        }
    }

    if (!$matched) {
        print "  Match fail.\n";
        $new_entry = $entry;    # fall back to the entry as supplied
        $failed++;
    }

    # Copy over every field of the original entry that the chosen entry
    # does not already have.
    my %new_fieldlist = map { $_ => 1 } $new_entry->fieldlist;
    foreach my $field ($entry->fieldlist) {
        next if (exists($new_fieldlist{$field}));
        $new_entry->set($field, $entry->get($field));
    }

    open(my $out, '>>', $output) or die("Cannot append to $output: $!\n");
    print $out $new_entry->print_s;
    print $out "\n\n";
    close($out) or die("Cannot close $output: $!\n");

    print "Done with $title, $authors.\n\n";

    $total++;
}

print "Ran $total entries; $failed failed.\n";

# lookup_single($string_left, $string_right)
#
# Searches the bibliography service at $url and returns the BibTeX text of
# the first result, or "" when nothing usable was found (a diagnostic is
# printed in that case).
#
#   $string_left   search terms
#   $string_right  either a four-digit year, which is sent as the separate
#                  "year" form field, or more search terms that are appended
#                  to $string_left; undef is treated as ""
sub lookup_single {
    my ($string_left, $string_right) = @_;

    $string_left  = "" unless (defined($string_left));
    $string_right = "" unless (defined($string_right));

    my $query;
    my $year = "";
    if ($string_right =~ m/^\d\d\d\d$/) {
        $year  = $string_right;
        $query = $string_left;
    } else {
        $query = "$string_left $string_right";
    }

    # Remove BibTeX braces and a full stop at the end of any line.
    $query =~ s/\{|\}//mg;
    $query =~ s/\.$//mg;

    my $response = $agent->post($url,
        [ maxnum  => 10,
          query   => $query,
          results => "citation",
          sort    => "score",
          year    => $year
        ]
    );
    my $results = $response->content();

    if ($results !~ m/accesskey\=/) {
        print "No accesskey for $string_left, $string_right.\n";
        return "";
    }

    # Link to the BibTeX view of the result carrying accesskey="1".
    my ($bibtex_url) = $results
        =~ m/accesskey\=\"1\".*?\"biblinks\".*?\"(\/cgi-bin\/bibshow.*?)\"/s;

    if (!defined($bibtex_url)) {
        print "Bibtex URL error on $string_left, $string_right.\n";
        return "";
    }

    $bibtex_url =~ s/\&amp\;/&/gmi; #stupid html escaping

    $response = $agent->get(uri_unescape("$base$bibtex_url"));
    my ($html) = $response->content()
        =~ m/.*?<pre class=\"bibtex\">(.*?)<\/pre>/s;

    if (!defined($html)) {
        print "<pre> tag parse error on $string_left, $string_right.\n";
        return "";
    }

    my $bibtex = $stripper->parse($html);
    # The stripper is shared between calls; eof() resets its parser state
    # so that one document cannot affect how the next one is stripped.
    $stripper->eof();

    return $bibtex;
}


#sub value_to_string {
#  my $string = "";
#    my $value = shift;
#    my @all_values = $value->values;
#    my $simpleval;
#    foreach $simpleval (@all_values) {
#      $string .= $simpleval->text . " ";
#    }
#}



################################### citeulike_pdf_grabber.pl
#!/alruccabah/local/usr/bin/perl

use warnings;
use strict;

use WWW::Mechanize;

print "Starting up.\n";

# CiteULike account to log in as, and the lpr queue used for printing.
my $username = "eweaver";
my $printer = "cis5";

# Usage: <password> [--print]
my $password;
if (defined($ARGV[0])) {
    $password = $ARGV[0];
} else {
    die("You need to supply a password, and optionally, the --print flag.\n");
}

# Without --print, downloads are saved as "<title>.pdf~" and nothing is
# printed; with --print they are saved as "<title>.pdf" and printed.
my $extension = "pdf~";
my $print = 0;
if (defined($ARGV[1]) && $ARGV[1] eq "--print") {
    print "Printing enabled!\n";
    sleep(5);    # presumably a grace period to abort - TODO confirm
    $print = 1;
    $extension = "pdf";
}


# Site root (with trailing slash) and the user's library page below it.
# $home already ends in "/", so none is added before "user".
my $home = "http://www.citeulike.org/";
my $base = "${home}user/$username/";

# autocheck => 1: Mechanize requests die on failure.
my $mech = WWW::Mechanize->new(autocheck => 1);

# Directory holding the downloaded files; everything below works with
# paths relative to it.
my $print_cache = "/usa/$username/tmp/pdf_cache/";

# Create the cache directory *before* entering it. Previously chdir() ran
# first and unchecked, so on a first run the files were written to whatever
# directory the script was started from.
print "Creating print cache...\n";
if (! -d $print_cache) {
    mkdir($print_cache) or die("Cannot create $print_cache: $!\n");
} else {
    print "Exists.\n";
}
chdir($print_cache) or die("Cannot enter $print_cache: $!\n");

# Report of articles without a retrievable .pdf; start each run empty.
my $missing = "missing.html";
open(my $missing_reset, '>', $missing)
    or die("Cannot write $missing: $!\n");
close($missing_reset);

#$mech->cookie_jar(HTTP::Cookies->new);

# Log in through the form named "frm", reached via the "Log in" link on
# the user's page.
$mech->get($base);
$mech->follow_link( text => "Log in");

$mech->form_name("frm");
$mech->set_visible($username, $password);
$mech->submit();

print "We should be authenticated now.\n";

$mech->follow_link( text => $username);

print "At the base page.\n";

# Tag links to process: every link whose URL contains "/tag/" and that
# comes after the link labelled "RSS" on the page.
my @tag_links;

print "Searching for tags.\n";

my $found_rss = 0;
foreach my $link ($mech->links()) {
    # Links without text (e.g. images) may report undef.
    my $text = $link->text();
    $text = "" unless (defined($text));

    if ($found_rss) {
        if ($link->url() =~ m/\/tag\//) {
            push(@tag_links, $link);
            print "  Found tag " . $text . ".\n";
        }
    } elsif ($text eq "RSS") {
        $found_rss = 1;
    }
}
#my @tag_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/tag\//);

print "Found " . scalar(@tag_links) . " tags.\n";

# Number of articles for which no .pdf could be obtained.
my $miss = 0;

# Fetch a URL or link without aborting the run: $mech was created with
# autocheck enabled, so a plain get() dies on any failed request. Returns
# true when the request succeeded.
my $fetch = sub {
    my ($target) = @_;
    my $ok = eval { $mech->get($target); 1 };
    return ($ok && $mech->success()) ? 1 : 0;
};

# Link texts tried, in this order, when a page has no direct .pdf link.
my @regexs = ( "View article online",
               "PDF",
               "Full text",
               "here" );

foreach my $tag_url (@tag_links) {
    my ($tag) = $tag_url->url() =~ m/.*\/(.+)$/;
    $tag = $tag_url->url() unless (defined($tag));

    print "  Fetching $tag.\n";

    # Go back to the base page first; link objects are fetched from there.
    $mech->get($base);
    $mech->get($tag_url);

    my @cite_links = $mech->find_all_links(
        url_abs_regex => qr/\/$username\/article\/\d+$/ );
    print "  Found " . scalar(@cite_links) . " citations.\n";

    foreach my $cite_url (@cite_links) {
        print "    Looking for article at: " . $cite_url->url() . "\n";
        $fetch->($base);
        if (!$fetch->($cite_url)) {
            print "    Not found.\n\n";
            next;
        }

        my $title = $mech->title();
        $title = "" unless (defined($title));
        $title =~ s/CiteULike\: //;

        # File name stem derived from the title. Path separators cannot be
        # part of a file name, and an empty title falls back to the article
        # number from the URL.
        my $stem = $title;
        $stem =~ s{[/\\]}{_}g;
        if (!length($stem)) {
            my ($id) = $cite_url->url() =~ m/(\d+)$/;
            $stem = "article_" . (defined($id) ? $id : "unknown");
        }
        my $printed_file = "$stem.pdf";           # saved by a --print run
        my $saved_file   = "$stem.$extension";    # what this run saves

        # Counts intermediate links followed while hunting for a .pdf.
        my $partial_hit_flag = 0;

        print "    Fetched article $title.\n";
        if (-e $printed_file || -e $saved_file) {
            print "    Article exists in print cache.\n";
        } else {
            print "    Retrieving article.\n";
            my $link = $mech->find_link(
                url_abs_regex => qr/\/pdf\/user\/$username\/.*\.pdf/i);
            if ($link) {
                print "      Found a personal .pdf at " . $link->url() . ".\n";
                if ($fetch->($link)) {
                    $mech->save_content($saved_file);
                    print "      Saved it.\n";
                }
            } else {
                print "      Trying to follow an external link.\n";

                # Follow promising links at most six levels deep.
                foreach my $depth (0 .. 5) {
                    $link = $mech->find_link( url_regex => qr/\.pdf\s*$/ );
                    if ($link) {
                        print "        I found a direct link.\n";
                        if ($fetch->($link)) {
                            $mech->save_content($saved_file);
                            print "        Saved it.\n";
                        }
                        last;
                    }

                    print "        Descending URL tree ($depth).\n";

                    foreach my $link_text (@regexs) {
                        $link = $mech->find_link(
                            text_regex => qr/$link_text/i);
                        next unless ($link);

                        print "        Found a \"" . $link_text . "\" link.\n";
                        $fetch->($link);
                        $partial_hit_flag++;
                        last;
                    }
                }
            }

            if (!-e $printed_file && !-e $saved_file) {
                print "  Couldn't find any match for:\n    $title\n    "
                    . $cite_url->url() . "    \n";
                $miss++;

                open(my $missing_fh, '>>', $missing)
                    or die("Cannot append to $missing: $!\n");
                print $missing_fh "<p>";
                if ($partial_hit_flag > 0) {
                    print $missing_fh "<b>Please check following (partial hits $partial_hit_flag):</b><br>\n";
                }
                # url_abs() is the link resolved against the page it was
                # found on, so no manual joining with $home is needed.
                print $missing_fh "<a href=\"" . $cite_url->url_abs()
                    . "\">$title</a></p>\n";
                close($missing_fh);
            } elsif ($print) {
                print "  Printing $title.\n";
                unlink("tmp.ps") if (-e "tmp.ps");
                if (-e "$stem.pdf~") {
                    print "  Removed non-printed cache file.\n";
                    unlink("$stem.pdf~");
                }

                # List-form system(): the file name comes from a scraped
                # page title and must never be interpreted by a shell.
                my $status = system("acroread", "-toPostScript",
                                    "-start", "1", "-end", "40", "-pairs",
                                    $printed_file, "tmp.ps");
                if ($status != 0) {
                    print "  Conversion to .ps failed; not printing.\n";
                } else {
                    print "  Converted to .ps.\n";
                    sleep(3);
                    system("lpr", "-P$printer", "tmp.ps");
                }
            }
        }
        print "\n";
    }
}

print "Done. There were $miss articles I couldn't find.\n";

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://515163]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others imbibing at the Monastery: (5)
As of 2020-09-29 23:12 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    If at first I don't succeed, I …










    Results (153 votes). Check out past polls.

    Notices?