################################### bib_canonicalizer.pl
#!/alruccabah/local/usr/bin/perl
#
# Rewrites BibTeX entries using the records found on the bibliography search
# service at $url.
#
# Usage:
#   bib_canonicalizer.pl <citeulike-path | input.bib> [output.bib]
#
# If the first argument starts with $username it is appended to $citeulike and
# the BibTeX is downloaded (a copy is kept in tmp.bib); otherwise it is read
# as a local file and the output file name is mandatory.
#
# Every entry is looked up in up to three passes (title+authors, title+year,
# authors+year).  A result is accepted when compare() rates its title against
# the original title above 0.5; fields the accepted record lacks are copied
# over from the original entry.  Entries without an accepted result are
# written out unchanged and counted as failed.
#
# NOTE(review): this listing appears to hold two independent scripts (see the
# second banner further down); they are presumably meant to live in separate
# files.

use strict;
use warnings;
#use diagnostics;

use Text::BibTeX qw(:macrosubs);
use LWP;
use HTML::Strip;
use URI::Escape;
use String::Compare;

my $username  = "whatever";
my $base      = "http://liinwww.ira.uka.de";
my $url       = "$base/csbib";
my $citeulike = "http://www.citeulike.org/bibtex/user/";
my $input     = $ARGV[0];
my $output    = $ARGV[1];
die("Specify your citeulike username/tag path or an input .bib file.\n") unless ($input);

my $agent    = LWP::UserAgent->new();
my $stripper = HTML::Strip->new();

# Register the three-letter month macros so entries using them parse cleanly.
my %month = (
    jan => "January",   feb => "February", mar => "March",
    apr => "April",     may => "May",      jun => "June",
    jul => "July",      aug => "August",   sep => "September",
    oct => "October",   nov => "November", dec => "December",
);
while (my ($macro, $value) = each %month) {
    add_macro_text($macro, $value);
}

#my $Input = new Text::BibTeX::File($input);
my @Input;
if ($input =~ m/^\Q$username\E/) {
    print "Trying to retrieve from $citeulike$input.\n";
    # NOTE(review): LWP::UserAgent::get documents its trailing arguments as
    # header name/value pairs; this array ref looks like it was meant as query
    # parameters.  Left untouched -- confirm what the server expects.
    my $response = $agent->get($citeulike . $input, [do_username_prefix => 0, key_type => 0] );
    my $bibtex = $response->content;
    $bibtex = "" unless defined($bibtex);

    # Keep a copy of the raw download next to the output.
    open(my $tmp_fh, '>', "tmp.bib") or die "Cannot write tmp.bib: $!\n";
    print {$tmp_fh} $bibtex;
    close($tmp_fh) or die "Cannot close tmp.bib: $!\n";

    @Input = split(/\n\@/, $bibtex);
    if (!$output) {
        # Name the output after the last path component of the request;
        # trailing slashes are ignored so the name is never just ".bib".
        (my $stem = $input) =~ s{/+\z}{};
        ($output) = $stem =~ m{([^/]+)\z};
        $output = $stem unless defined($output);
        $output .= ".bib";
    }
} else {
    die("Specify an output file when reading a local .bib file.\n") unless ($output);
    open(my $in_fh, '<', $input) or die "Cannot read $input: $!\n";
    my $bibtex = do { local $/ = undef; <$in_fh> };
    close($in_fh);
    $bibtex = "" unless defined($bibtex);
    @Input = split(/\n\@/, $bibtex);
}

# The first chunk is whatever precedes the first "\n@" and is normally
# discarded.  When the data starts directly with an entry, that chunk IS the
# first entry: strip its "@" so it looks like the other chunks and keep it.
my $leading = shift(@Input);
if (defined($leading) && $leading =~ s/\A\s*\@//) {
    unshift(@Input, $leading);
}

# Start from an empty output file; entries are appended one at a time so the
# file is complete up to the last finished entry if the run is interrupted.
open(my $reset_fh, '>', $output) or die "Cannot write $output: $!\n";
close($reset_fh) or die "Cannot close $output: $!\n";
#my $Output = new Text::BibTeX::File(">$output");

my ($failed, $total) = (0, 0);
foreach my $entry_text (@Input) {
    #print "$entry_text\n";
    #exit();
    my $entry = Text::BibTeX::Entry->new("@" . $entry_text);
    next unless $entry->parse_ok;

    # Space-prefixed list of author last names (the leading space is kept
    # from the original implementation).
    my @names = $entry->exists("author") ? $entry->names("author") : ();
    my $authors = "";
    foreach my $name (@names) {
        $authors .= " " . join(" ", $name->part("last"));
    }

    my ($title) = $entry->get("title");
    my ($year)  = $entry->get("year");
    $title = "" unless defined($title);

    # Lookup passes, tried in order until one yields a matching title.
    my @attempts = (
        [ $title,   $authors ],
        [ $title,   $year    ],
        [ $authors, $year    ],
    );

    my $new_entry;
    my $matched = 0;
    my $pass    = 0;
    # Without a title there is nothing to validate a result against, so the
    # lookups are skipped and the entry is kept as it is.
    if (length($title)) {
        foreach my $attempt (@attempts) {
            my $found = lookup_single(@{$attempt});
            $pass++;
            print " Pass $pass.\n";
            next unless $found;

            my $candidate = Text::BibTeX::Entry->new($found);
            next unless $candidate->parse_ok;
            # delete() takes field names, not an array reference.
            $candidate->delete("crossref");

            my $new_title = $candidate->get("title");
            if (defined($new_title) && compare($title, $new_title) > .5) {
                $new_entry = $candidate;
                $matched   = 1;
                last;
            }
        }
    }

    if (!$matched) {
        print " Match fail.\n";
        $new_entry = $entry;
        $failed++;
    }

    # Carry over every original field the accepted record does not have.
    my %new_fieldlist = map { $_ => 1 } $new_entry->fieldlist;
    foreach my $field ($entry->fieldlist) {
        next if exists($new_fieldlist{$field}); # && !($field =~ m/url/)
        $new_entry->set($field, $entry->get($field));
    }

    open(my $out_fh, '>>', $output) or die "Cannot append to $output: $!\n";
    print {$out_fh} $new_entry->print_s;
    print {$out_fh} "\n\n";
    close($out_fh) or die "Cannot close $output: $!\n";

    print "Done with $title, $authors.\n\n";
    $total++;
}
print "Ran $total entries; $failed failed.\n";

# lookup_single($string_left, $string_right)
#
# Posts one query to $url and returns the BibTeX text of the top hit with the
# HTML markup stripped, or "" when nothing usable came back (a diagnostic is
# printed in that case).
#
# If $string_right is exactly four digits it is sent as the "year" form field
# and only $string_left is used as the query; otherwise both strings are
# joined into the query.  Undefined arguments are treated as empty strings.
sub lookup_single {
    my ($string_left, $string_right) = @_;
    $string_left  = "" unless defined($string_left);
    $string_right = "" unless defined($string_right);

    my $query;
    my $year = "";
    if ($string_right =~ m/^\d\d\d\d$/) {
        $year  = $string_right;
        $query = $string_left;
    } else {
        $query = "$string_left $string_right";
    }
    $query =~ s/\{|\}//mg;   # drop BibTeX braces
    $query =~ s/\.$//mg;     # drop a full stop at the end of a line

    my $response = $agent->post($url, [
        maxnum  => 10,
        query   => $query,
        results => "citation",
        sort    => "score",
        year    => $year,
    ]);
    my $listing = $response->content();
    $listing = "" unless defined($listing);

    if ($listing !~ m/accesskey\=/) {
        print "No accesskey for $string_left, $string_right.\n";
        return "";
    }

    # First result (accesskey="1"): pick the bibshow link from its "biblinks".
    my ($bibtex_url) = $listing =~ m/accesskey\=\"1\".*?\"biblinks\".*?\"(\/cgi-bin\/bibshow.*?)\"/s;
    #print "$base$bibtex_url\n";
    #exit();
    if (!defined($bibtex_url)) {
        print "Bibtex URL error on $string_left, $string_right.\n";
        return "";
    }

    $bibtex_url =~ s/&amp;/&/gi; #stupid html escaping
    $response = $agent->get(uri_unescape("$base$bibtex_url"));
    my $page = $response->content();
    $page = "" unless defined($page);

    # NOTE(review): the opening tag of this pattern was lost from the listing;
    # it is reconstructed as a <pre> element to pair with the surviving
    # closing tag -- confirm against the service's actual markup.
    my ($html) = $page =~ m/<pre[^>]*>(.*?)<\/pre>/s;
    #print "$base$bibtex_url\n";
    #print $html;
    #exit();
    if (!defined($html)) {
        print "tag parse error on $string_left, $string_right.\n";
        return "";
    }

    my $plain = $stripper->parse($html);
    $stripper->eof;   # reset parser state before the next document
    return $plain;
}

#sub value_to_string {
#    my $string = "";
#    my $value = shift;
#    my @all_values = $value->values;
#    my $simpleval;
#    foreach $simpleval (@all_values) {
#        $string .= $simpleval->text . " ";
#    }
#}

################################### citeulike_pdf_grabber.pl
#!/alruccabah/local/usr/bin/perl
#
# Logs in to CiteULike as $username, walks every tag page linked after the
# "RSS" link on the user's page, and tries to download a PDF for each cited
# article into $print_cache.  Articles for which no PDF could be saved are
# listed in missing.html inside the cache directory.
#
# Usage:
#   citeulike_pdf_grabber.pl <password> [--print]
#
# Without --print, downloads are saved with the extension "pdf~".  With
# --print they are saved as ".pdf", converted to PostScript with acroread and
# sent to printer $printer with lpr.
#
# NOTE(review): the password is taken from the command line, where other
# local users may be able to see it in the process list.

use warnings;
use strict;
use WWW::Mechanize;

print "Starting up.\n";

my $username = "eweaver";
my $printer  = "cis5";

my $password;
if (defined($ARGV[0])) {
    $password = $ARGV[0];
} else {
    die ("You need to supply a password, an optionally, the --print flag.\n");
}

my $extension = "pdf~";
my $print = 0;
if (defined($ARGV[1]) && $ARGV[1] eq "--print") {
    print "Printing enabled!\n";
    sleep(5);
    $print = 1;
    $extension = "pdf";
}

my $home = "http://www.citeulike.org/";
my $base = "$home/user/$username/";
my $mech = WWW::Mechanize->new(autocheck => 1);

# The cache directory has to exist before we can chdir into it; everything
# below (missing.html, downloads, tmp.ps) uses paths relative to it.
my $print_cache = "/usa/$username/tmp/pdf_cache/";
print "Creating print cache...\n";
if (! -e $print_cache) {
    mkdir($print_cache) or die "Cannot create $print_cache: $!\n";
} else {
    print "Exists.\n";
}
chdir($print_cache) or die "Cannot chdir to $print_cache: $!\n";

# Start every run with an empty report.
my $missing = "missing.html";
open(my $reset_fh, '>', $missing) or die "Cannot write $missing: $!\n";
close($reset_fh) or die "Cannot close $missing: $!\n";

#$mech->cookie_jar(HTTP::Cookies->new);
$mech->get($base);
$mech->follow_link( text => "Log in");
$mech->form_name("frm");
$mech->set_visible($username, $password);
$mech->submit();
print "We should be authenticated now.\n";

$mech->follow_link( text => "$username");
print "At the base page.\n";

# Only links that come after the "RSS" link on the page are considered.
my @tag_links;
print "Searching for tags.\n";
my $found_rss = 0;
foreach my $page_link ($mech->links()) {
    my $link_text = $page_link->text();
    $link_text = "" unless defined($link_text);   # e.g. image links
    if ($found_rss) {
        if ($page_link->url() =~ m/\/tag\//) {
            push (@tag_links, $page_link);
            print " Found tag " . $link_text . ".\n";
        }
    } elsif ($link_text eq "RSS") {
        $found_rss = 1;
    }
}
#my @tag_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/tag\//);
print "Found " . scalar(@tag_links) . " tags.\n";

my $miss = 0;
foreach my $tag_url (@tag_links) {
    my ($tag) = $tag_url->url() =~ m/.*\/(.+)$/;
    $tag = $tag_url->url() unless defined($tag);
    print " Fetching $tag.\n";

    # Return to the user's page first so the link's URL is resolved against
    # the same base it was found on.
    $mech->get($base);
    $mech->get($tag_url);
    my @cite_links = $mech->find_all_links( url_abs_regex => qr/\/\Q$username\E\/article\/\d+$/ );
    print " Found " . scalar(@cite_links) . " citations.\n";

    foreach my $cite_url (@cite_links) {
        print " Looking for article at: " . $cite_url->url() . "\n";
        $mech->get($base);
        if (!try_get($cite_url)) {
            print " Not found.\n\n";
            next;
        }

        my $title = $mech->title();
        $title = $cite_url->url() unless (defined($title) && length($title));
        $title =~ s/CiteULike\: //;

        # The title doubles as the file name; a "/" would be taken as a
        # directory separator and make the save fail.
        (my $file_stem = $title) =~ s{/}{_}g;
        my $pdf_path   = "$file_stem.pdf";
        my $cache_path = "$file_stem.$extension";

        my $partial_hit_flag = 0;
        print " Fetched article $title.\n";

        if (-e $pdf_path || -e $cache_path) {
            print " Article exists in print cache.\n";
        } else {
            print " Retrieving article.\n";
            my $link = $mech->find_link( url_abs_regex => qr/\/pdf\/user\/\Q$username\E\/.*\.pdf/i);
            if ($link) {
                if (try_get($link)) {
                    print " Found a personal .pdf at " . $link->url() . ".\n";
                    $mech->save_content($cache_path);
                    print " Saved it.\n";
                }
            } else {
                print " Trying to follow an external link.\n";
                # $mech->get($cite_url); # reset success() flag
                $partial_hit_flag = follow_external_links($cache_path);
            }

            if (!-e $pdf_path && !-e $cache_path) {
                print " Couldn't find any match for:\n $title\n " . $cite_url->url() . " \n";
                $miss++;

                # NOTE(review): the HTML tags of this report were lost from
                # the listing; the markup below is a reconstruction -- adjust
                # if the original layout is known.
                open(my $missing_fh, '>>', $missing) or die "Cannot append to $missing: $!\n";
                print {$missing_fh} "<p>";
                if ($partial_hit_flag > 0) {
                    print {$missing_fh} "Please check following (partial hits $partial_hit_flag):<br>\n";
                }
                # Absolute URL so the link still works from the local file.
                print {$missing_fh} "<a href=\"" . html_escape($cite_url->url_abs()) . "\">"
                    . html_escape($title) . "</a></p>\n";
                close($missing_fh) or die "Cannot close $missing: $!\n";
            } else {
                if ($print) {
                    print_article($title, $file_stem);
                }
            }
        }
        print "\n";
    }
}
print "Done. There were $miss articles I couldn't find.\n";

# try_get($target)
#
# Fetches $target (a URL or link object) with $mech and returns 1 on success,
# 0 on failure.  $mech is created with autocheck on, so a failed request
# dies; the error is trapped here so one bad link does not end the whole run.
sub try_get {
    my ($target) = @_;
    my $ok = eval { $mech->get($target); 1 };
    if (!$ok) {
        warn " Fetch failed: $@";
        return 0;
    }
    return $mech->success() ? 1 : 0;
}

# follow_external_links($save_as)
#
# Starting from the page $mech currently shows, follows up to six hops
# looking for a link whose URL ends in ".pdf".  When one is found its content
# is saved to $save_as.  On pages without such a link, the first link whose
# text matches one of the known phrases is followed instead.
# Returns the number of such phrase links that were followed.
sub follow_external_links {
    my ($save_as) = @_;
    my @regexs = ( "View article online", "PDF", "Full text", "here" );
    my $partial_hits = 0;

    HOP:
    for my $depth (0 .. 5) {
        my $link = $mech->find_link( url_regex => qr/\.pdf\s*$/ );
        if ($link) {
            print " I found a direct link.\n";
            if (try_get($link)) {
                $mech->save_content($save_as);
                print " Saved it.\n";
            }
            last HOP;
        }

        print " Descending URL tree ($depth).\n";
        my $followed = 0;
        foreach my $link_text (@regexs) {
            $link = $mech->find_link( text_regex => qr/$link_text/i);
            next unless $link;
            print " Found a \"" . $link_text . "\" link.\n";
            last HOP unless try_get($link);
            $partial_hits++;
            $followed = 1;
            last;
        }
        # Nothing to follow: the page will not change, so stop here.
        last HOP unless $followed;
    }
    return $partial_hits;
}

# print_article($title, $file_stem)
#
# Converts "$file_stem.pdf" to tmp.ps with acroread (pages 1-40) and sends it
# to $printer.  A leftover "$file_stem.pdf~" from a non-printing run is
# removed.  External commands are run without a shell because the file name
# is derived from a web page title.
sub print_article {
    my ($title, $file_stem) = @_;
    print " Printing $title.\n";

    if (-e "tmp.ps") {
        unlink("tmp.ps") or warn " Could not remove tmp.ps: $!\n";
    }
    if (-e "$file_stem.pdf~") {
        print " Removed non-printed cache file.\n";
        unlink("$file_stem.pdf~") or warn " Could not remove $file_stem.pdf~: $!\n";
    }

    my @convert = ("acroread", "-toPostScript", "-start", "1", "-end", "40",
                   "-pairs", "$file_stem.pdf", "tmp.ps");
    if (system(@convert) != 0) {
        warn " acroread failed for $file_stem.pdf; not printing.\n";
        return;
    }
    print " Converted to .ps.\n";
    sleep(3);

    if (system("lpr", "-P$printer", "tmp.ps") != 0) {
        warn " lpr failed for $title.\n";
    }
    return;
}

# html_escape($text)
#
# Returns $text with &, <, > and " replaced by HTML entities; undef becomes
# the empty string.
sub html_escape {
    my ($text) = @_;
    $text = "" unless defined($text);
    $text = "$text";
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}