################################### bib_canonicalizer.pl
#!/alruccabah/local/usr/bin/perl
use strict;
use warnings;
#use diagnostics;
# Account name; an input argument that begins with it is treated as a
# CiteULike "username/tag" path instead of a local .bib file.
my $username = "whatever";
# Host of the bibliography search service and the search form that
# lookup_single() posts its queries to.
my $base = "http://liinwww.ira.uka.de";
my $url = "$base/csbib";
# Prefix put in front of the username/tag path when downloading BibTeX
# from CiteULike.
my $citeulike = "http://www.citeulike.org/bibtex/user/";
# Command line: <username/tag | input.bib> [output.bib]
my $input = $ARGV[0];
my $output = $ARGV[1];
die("Specify your citeulike username/tag path or an input .bib file.\n")
    unless ($input);
use Text::BibTeX qw(:macrosubs);
use LWP;
use HTML::Strip;
use URI::Escape;
use String::Compare;
# Shared helpers: the HTML-to-text converter and the HTTP client that the
# rest of the script uses for every request.
my $stripper = HTML::Strip->new();
my $agent = LWP::UserAgent->new();

# Three-letter month macros and the full month names they expand to.
my %month = (
    jan => "January",
    feb => "February",
    mar => "March",
    apr => "April",
    may => "May",
    jun => "June",
    jul => "July",
    aug => "August",
    sep => "September",
    oct => "October",
    nov => "November",
    dec => "December",
);

# Register each abbreviation as a macro with Text::BibTeX.
foreach my $abbreviation (keys %month) {
    add_macro_text($abbreviation, $month{$abbreviation});
}
#my $Input = new Text::BibTeX::File($input);
# Load the bibliography into @Input.  The text is split on "\n@", so each
# entry arrives without its leading "@" and the first element holds
# whatever preceded the first entry (the code further down discards it).
my @Input;
if ($input =~ m/^\Q$username\E/) {
    # Remote mode: the argument is a CiteULike "username/tag" path.
    print "Trying to retrieve from $citeulike$input.\n";
    # LWP::UserAgent::get() treats extra arguments as request headers, so
    # the export options have to travel in the query string instead.
    my $response = $agent->get(
        $citeulike . $input . "?do_username_prefix=0&key_type=0");
    die("Could not retrieve $citeulike$input: "
        . $response->status_line . "\n")
        unless $response->is_success;
    my $content = $response->content;

    # Keep a copy of the raw download in the current directory.
    open(my $tmp, '>', "tmp.bib") or die("Cannot write tmp.bib: $!\n");
    print {$tmp} $content;
    close($tmp) or die("Cannot close tmp.bib: $!\n");

    # The extra leading newline lets the separator match an entry that
    # starts at the very first character, so that entry is not mistaken
    # for the discarded preamble.
    @Input = split(/\n\@/, "\n" . $content);

    if (!$output) {
        # Default output name: the last path component of the argument,
        # or the username when there is no usable component.
        ($output) = $input =~ m/.*\/(.*)/;
        $output = $username unless (defined($output) && length($output));
        $output .= ".bib";
    }
} else {
    # Local mode: read the whole .bib file; an output name is mandatory.
    die("Specify an output file when reading from a local .bib file.\n")
        unless ($output);
    open(my $in, '<', $input) or die("Cannot read $input: $!\n");
    my $content = do { local $/ = undef; <$in> };
    close($in);
    $content = "" unless defined($content);
    @Input = split(/\n\@/, "\n" . $content);
}
# Discard the text that preceded the first "\n@" separator; it is not an
# entry.
shift(@Input);

# Open the output once and unbuffer it, so every finished entry reaches
# the disk immediately even if a later lookup aborts the run.
open(my $out, '>', $output) or die("Cannot write $output: $!\n");
{
    my $previous = select($out);
    $| = 1;
    select($previous);
}
#my $Output = new Text::BibTeX::File(">$output");

my ($failed, $total) = (0, 0);
foreach my $entry_text (@Input) {
    # split() consumed the "@" that starts every entry; put it back.
    my $entry = Text::BibTeX::Entry->new("@" . $entry_text);
    next unless $entry->parse_ok;

    # Search terms: the authors' last names, the title and the year.
    my $authors = "";
    if ($entry->exists("author")) {
        foreach my $name ($entry->names("author")) {
            $authors .= " " . join(" ", $name->part("last"));
        }
    }
    my ($title) = $entry->get("title");
    $title = "" unless defined($title);
    my ($year) = $entry->get("year");

    # Up to three searches, each with a different pair of terms.  The
    # first result whose title scores above .5 against ours wins.
    my @attempts = (
        [ $title,   $authors ],
        [ $title,   $year ],
        [ $authors, $year ],
    );
    my $new_entry;
    my $pass = 0;
    foreach my $attempt (@attempts) {
        my $found_text = lookup_single(@$attempt);
        $pass++;
        print " Pass $pass.\n";
        next unless $found_text;

        my $candidate = Text::BibTeX::Entry->new($found_text);
        next unless $candidate->parse_ok;
        # delete() expects field names, not an array reference.
        $candidate->delete("crossref");

        my $candidate_title = $candidate->get("title");
        # Without a title on both sides the result cannot be verified.
        next unless (length($title) && defined($candidate_title));
        if (compare($title, $candidate_title) > .5) {
            $new_entry = $candidate;
            last;
        }
    }
    if (!defined($new_entry)) {
        print " Match fail.\n";
        $new_entry = $entry;
        $failed++;
    }

    # Carry over every field of the original entry that the replacement
    # does not already have; fields present in both keep the new value.
    my %has_field = map { $_ => 1 } $new_entry->fieldlist;
    foreach my $field ($entry->fieldlist) {
        next if $has_field{$field};    # && !($field =~ m/url/)
        $new_entry->set($field, $entry->get($field));
    }

    print {$out} $new_entry->print_s;
    print {$out} "\n\n";
    print "Done with $title, $authors.\n\n";
    $total++;
}
close($out) or die("Cannot close $output: $!\n");
print "Ran $total entries; $failed failed.\n";
# lookup_single($string_left, $string_right)
#
# Posts a search to the bibliography form at $url, follows the "bibshow"
# link of the result marked accesskey="1" and returns the text found in
# that page's <pre class="bibtex"> element with the HTML markup stripped.
#
#   $string_left  - search terms; undef is treated as "".
#   $string_right - either more search terms, or a four-digit year, in
#                   which case it is sent as the "year" form field rather
#                   than added to the query; undef is treated as "".
#
# Returns "" when any step fails.  Failures are reported on STDOUT and are
# never fatal.
sub lookup_single {
    my ($string_left, $string_right) = @_;
    $string_left  = "" unless defined($string_left);
    $string_right = "" unless defined($string_right);

    my $year  = "";
    my $query = "$string_left $string_right";
    if ($string_right =~ m/^\d\d\d\d$/) {
        $year  = $string_right;
        $query = $string_left;
    }
    # Drop BibTeX braces and a period at the end of a line before searching.
    $query =~ s/[{}]//g;
    $query =~ s/\.$//mg;

    my $response = $agent->post($url,
        [ maxnum  => 10,
          query   => $query,
          results => "citation",
          sort    => "score",
          year    => $year
        ]
    );
    my $results = $response->content();
    if ($results !~ m/accesskey\=/) {
        print "No accesskey for $string_left, $string_right.\n";
        return "";
    }

    my ($bibtex_url) = $results
        =~ m/accesskey\=\"1\".*?\"biblinks\".*?\"(\/cgi-bin\/bibshow.*?)\"/s;
    if (!defined($bibtex_url)) {
        print "Bibtex URL error on $string_left, $string_right.\n";
        return "";
    }

    # The link was cut out of HTML source, where every ampersand is written
    # as an entity; turn those back into plain "&" before requesting it.
    $bibtex_url =~ s/&(?:amp|#38);/&/gi;
    $response = $agent->get(uri_unescape("$base$bibtex_url"));
    my ($html) = $response->content()
        =~ m/<pre class=\"bibtex\">(.*?)<\/pre>/s;
    if (!defined($html)) {
        print "<pre> tag parse error on $string_left, $string_right.\n";
        return "";
    }

    my $bibtex = $stripper->parse($html);
    # Reset the shared stripper so that parser state from this document
    # cannot leak into the next parse() call.
    $stripper->eof;
    return $bibtex;
}
#sub value_to_string {
# my $string = "";
# my $value = shift;
# my @all_values = $value->values;
# my $simpleval;
# foreach $simpleval (@all_values) {
# $string .= $simpleval->text . " ";
# }
#}
################################### citeulike_pdf_grabber.pl
#!/alruccabah/local/usr/bin/perl
use warnings;
use strict;
use WWW::Mechanize;
print "Starting up.\n";
# CiteULike account used for the login and for building every URL below.
my $username = "eweaver";
# Print queue handed to lpr when --print is given.
my $printer = "cis5";
# Command line: <password> [--print]
my $password;
if (defined($ARGV[0])) {
    $password = $ARGV[0];
} else {
    die("You need to supply a password, and optionally, the --print flag.\n");
}
# Without --print, downloads are saved as "<title>.pdf~"; with --print
# they are saved as "<title>.pdf" and sent to the printer.
my $extension = "pdf~";
my $print = 0;
if (defined($ARGV[1]) && $ARGV[1] eq "--print") {
    print "Printing enabled!\n";
    # NOTE(review): presumably a grace period to abort before anything is
    # printed -- confirm.
    sleep(5);
    $print = 1;
    $extension = "pdf";
}
my $home = "http://www.citeulike.org/";
# $home already ends in "/", so no extra separator is inserted here.
my $base = $home . "user/$username/";
my $mech = WWW::Mechanize->new(autocheck => 1);

# Directory that holds the downloaded PDFs and the report of misses.  It
# has to exist before the script can change into it.
my $print_cache = "/usa/$username/tmp/pdf_cache/";
print "Creating print cache...\n";
if (! -e $print_cache) {
    mkdir($print_cache) or die("Cannot create $print_cache: $!\n");
} else {
    print "Exists.\n";
}
chdir($print_cache) or die("Cannot change to $print_cache: $!\n");

# Start every run with an empty report of articles that were not found.
my $missing = "missing.html";
open(my $missing_reset, '>', $missing)
    or die("Cannot create $print_cache$missing: $!\n");
close($missing_reset) or die("Cannot close $print_cache$missing: $!\n");
#$mech->cookie_jar(HTTP::Cookies->new);
# Log in through the form named "frm", then go to the user's own page.
$mech->get($base);
$mech->follow_link( text => "Log in");
$mech->form_name("frm");
$mech->set_visible($username, $password);
$mech->submit();
print "We should be authenticated now.\n";
$mech->follow_link( text => "$username");
print "At the base page.\n";

# Collect the tag links.  Only links that come after the one labelled
# "RSS" are considered, and of those only URLs containing "/tag/".
my @tag_links;
print "Searching for tags.\n";
my $found_rss = 0;
foreach my $page_link ($mech->links()) {
    # Links that are not plain anchors may have no text at all.
    my $link_text = $page_link->text();
    $link_text = "" unless defined($link_text);
    if (!$found_rss) {
        $found_rss = 1 if ($link_text eq "RSS");
        next;
    }
    next unless ($page_link->url() =~ m/\/tag\//);
    push(@tag_links, $page_link);
    print " Found tag " . $link_text . ".\n";
}
#my @tag_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/tag\//);
print "Found " . scalar(@tag_links) . " tags.\n";
# fetch_ok($browser, $target)
#
# Loads $target (a URL or a WWW::Mechanize::Link) with $browser and returns
# 1 when the request succeeded, 0 otherwise.  $mech is created with
# autocheck enabled, which makes a failed get() die; the eval keeps one
# dead link from ending the whole run.
sub fetch_ok {
    my ($browser, $target) = @_;
    my $completed = eval { $browser->get($target); 1 };
    return ($completed && $browser->success()) ? 1 : 0;
}

# Visit every tag page and every article linked from it.  For each article
# not yet in the cache, try to download a PDF; record misses in $missing
# and, when $print is set, convert and print what was downloaded.
my $miss = 0;
foreach my $tag_url (@tag_links) {
    my ($tag) = $tag_url->url() =~ m/.*\/(.+)$/;
    $tag = $tag_url->url() unless defined($tag);
    print " Fetching $tag.\n";
    # NOTE(review): presumably reloads the base page so the link is
    # resolved against the page it was found on -- confirm.
    $mech->get($base);
    $mech->get($tag_url);
    my @cite_links = $mech->find_all_links(
        url_abs_regex => qr/\/\Q$username\E\/article\/\d+$/ );
    print " Found " . scalar(@cite_links) . " citations.\n";

    foreach my $cite_url (@cite_links) {
        print " Looking for article at: " . $cite_url->url() . "\n";
        $mech->get($base);
        if (!fetch_ok($mech, $cite_url)) {
            print " Not found.\n\n";
            next;
        }

        my $title = $mech->title();
        $title = "" unless defined($title);
        $title =~ s/CiteULike\: //;
        # The title doubles as the cache file name, so it must not contain
        # a path separator.
        $title =~ s{[/\0]}{_}g;
        my $partial_hit_flag = 0;
        print " Fetched article $title.\n";

        my $printable = "$title.pdf";
        my $download  = "$title.$extension";
        if (-e $printable || -e $download) {
            print " Article exists in print cache.\n";
        } else {
            print " Retrieving article.\n";
            my $link = $mech->find_link(
                url_abs_regex => qr/\/pdf\/user\/\Q$username\E\/.*\.pdf/i );
            if ($link) {
                print " Found a personal .pdf at " . $link->url() . ".\n";
                if (fetch_ok($mech, $link)) {
                    $mech->save_content($download);
                    print " Saved it.\n";
                }
            } else {
                print " Trying to follow an external link.\n";
                # Link texts that may lead towards the PDF, tried in order.
                my @link_texts = ( "View article online",
                                   "PDF",
                                   "Full text",
                                   "here" );
                # Follow at most six pages away from the article page.
                for my $depth (0 .. 5) {
                    $link = $mech->find_link( url_regex => qr/\.pdf\s*$/ );
                    if ($link) {
                        print " I found a direct link.\n";
                        if (fetch_ok($mech, $link)) {
                            $mech->save_content($download);
                            print " Saved it.\n";
                        }
                        last;
                    }

                    print " Descending URL tree ($depth).\n";
                    my $followed = 0;
                    foreach my $link_text (@link_texts) {
                        $link = $mech->find_link(
                            text_regex => qr/$link_text/i );
                        next unless $link;
                        print " Found a \"" . $link_text . "\" link.\n";
                        $followed = fetch_ok($mech, $link);
                        $partial_hit_flag++;
                        last;
                    }
                    # No new page was loaded, so another round would only
                    # search the same page again.
                    last unless $followed;
                }
            }

            if (!-e $printable && !-e $download) {
                print " Couldn't find any match for:\n $title\n"
                    . $cite_url->url() . " \n";
                $miss++;
                open(my $missing_fh, '>>', $missing)
                    or die("Cannot append to $missing: $!\n");
                print {$missing_fh} "<p>";
                if ($partial_hit_flag > 0) {
                    print {$missing_fh} "<b>Please check following (partial hits $partial_hit_flag):</b><br>\n";
                }
                print {$missing_fh} "<a href=\"" . $home . $cite_url->url()
                    . "\">$title</a></p>\n";
                close($missing_fh) or die("Cannot close $missing: $!\n");
            } elsif ($print) {
                print " Printing $title.\n";
                unlink("tmp.ps") if (-e "tmp.ps");
                if (-e "$title.pdf~") {
                    print " Removed non-printed cache file.\n";
                    unlink("$title.pdf~");
                }
                # List-form system() bypasses the shell, so characters in
                # the page title cannot be interpreted as shell syntax.
                system("acroread", "-toPostScript", "-start", "1",
                       "-end", "40", "-pairs", $printable, "tmp.ps");
                # tmp.ps was removed above, so its presence shows that the
                # conversion produced output.
                if (-e "tmp.ps") {
                    print " Converted to .ps.\n";
                    sleep(3);
                    system("lpr", "-P$printer", "tmp.ps");
                } else {
                    print " Conversion to .ps failed; nothing sent to the printer.\n";
                }
            }
        }
        print "\n";
    }
}
print "Done. There were $miss articles I couldn't find.\n";
|