################################### bib_canonicalizer.pl
#!/alruccabah/local/usr/bin/perl

# Replaces each BibTeX entry of a CiteULike export (or of a local .bib file)
# with the matching entry found through the csbib search form at $url.
#
# Usage:
#   bib_canonicalizer.pl <citeulike user/tag path> [output.bib]
#   bib_canonicalizer.pl <input.bib> <output.bib>
#
# An argument that starts with $username is treated as a CiteULike path and
# downloaded; anything else is read as a local file.  For every entry up to
# three searches are tried (title+authors, title+year, authors+year).  A hit
# is accepted when its title is similar enough to the original title; the
# fields the hit lacks are then copied over from the original entry.  Entries
# without an acceptable hit are written out unchanged.

use strict;
use warnings;
#use diagnostics;

use Text::BibTeX qw(:macrosubs);
use LWP;
use HTML::Strip;
use URI::Escape;
use String::Compare;

my $username = "whatever";
my $base = "http://liinwww.ira.uka.de";
my $url = "$base/csbib";
my $citeulike = "http://www.citeulike.org/bibtex/user/";

my $input = $ARGV[0];
my $output = $ARGV[1];

die("Specify your citeulike username/tag path or an input .bib file.\n") unless ($input);

my $agent = LWP::UserAgent->new();
my $stripper = HTML::Strip->new();

# Register the three-letter month abbreviations as BibTeX macros.
my %month = (jan => "January", feb => "February", mar => "March",
             apr => "April", may => "May", jun => "June",
             jul => "July", aug => "August", sep => "September",
             oct => "October", nov => "November", dec => "December");
my ($macro, $value);
add_macro_text($macro, $value) while (($macro, $value) = each %month);

#my $Input = new Text::BibTeX::File($input);

# Raw entry texts, split at every "@" that starts a line.  The "@" itself is
# consumed by the split and put back when an entry is parsed.
my @Input;

if ($input =~ m/^\Q$username\E/) {
    print "Trying to retrieve from $citeulike$input.\n";

    # The export options belong in the query string.  LWP::UserAgent::get()
    # treats every argument after the URL as a header field, so the array
    # reference that used to be passed here never reached the server as
    # parameters.
    # NOTE(review): assumes CiteULike reads these as GET parameters - confirm.
    my $response = $agent->get($citeulike . $input . "?do_username_prefix=0&key_type=0");
    if (!$response->is_success) {
        die("Could not retrieve $citeulike$input: " . $response->status_line . "\n");
    }

    # Keep a copy of the raw download next to the script's working directory.
    open(my $tmp, '>', "tmp.bib") or die("Cannot write tmp.bib: $!\n");
    print {$tmp} $response->content;
    close($tmp) or die("Cannot close tmp.bib: $!\n");

    @Input = split(/\n\@/, $response->content);

    if (!$output) {
        # Default output name: last component of the CiteULike path.
        $output = $input;
        $output =~ s{/+\z}{};
        $output =~ s{.*/}{}s;
        $output .= ".bib";
    }
} else {
    die("Specify an output file when reading a local .bib file.\n") unless ($output);

    local $/ = undef;    # slurp the whole file in one read
    open(my $in, '<', $input) or die("Cannot read $input: $!\n");
    my $text = <$in>;
    close($in);
    $text = "" unless (defined($text));
    @Input = split(/\n\@/, $text);
}

# The chunk before the first "\n@" is normally preamble and is dropped.  When
# the text starts directly with an entry, that chunk is the first entry
# (still carrying its "@") and must be kept.
my $first_chunk = shift(@Input);
if (defined($first_chunk) && $first_chunk =~ s/\A\s*\@//) {
    unshift(@Input, $first_chunk);
}

open(my $out, '>', $output) or die("Cannot write $output: $!\n");
#my $Output = new Text::BibTeX::File(">$output");

# Unbuffered output, so every finished entry is on disk even if a later
# (slow, networked) lookup is interrupted.
{
    my $previous = select($out);
    $| = 1;
    select($previous);
}

my ($failed, $total) = (0, 0);

foreach my $entry_text (@Input) {
    my $entry = Text::BibTeX::Entry->new("@" . $entry_text);
    next unless $entry->parse_ok;

    # Search key: the last names of all authors, each preceded by a space.
    my $authors = "";
    if ($entry->exists("author")) {
        foreach my $name ($entry->names("author")) {
            $authors .= " " . join(" ", $name->part("last"));
        }
    }

    my $title = $entry->get("title");
    $title = "" unless (defined($title));
    my $year = $entry->get("year");

    # Query pairs in the order they are tried.
    my @queries = ([$title, $authors], [$title, $year], [$authors, $year]);

    my $new_entry;
    for my $pass (1 .. scalar(@queries)) {
        my $bibtex = lookup_single(@{ $queries[$pass - 1] });
        print " Pass $pass.\n";
        next unless ($bibtex);

        my $candidate = Text::BibTeX::Entry->new($bibtex);
        next unless $candidate->parse_ok;

        # delete() takes field names, not an array reference.
        $candidate->delete("crossref");

        my $new_title = $candidate->get("title");
        next unless (length($title) && defined($new_title) && length($new_title));

        # Accept the hit only when its title resembles the original one.
        if (compare($title, $new_title) > .5) {
            $new_entry = $candidate;
            last;
        }
    }

    if (!defined($new_entry)) {
        print " Match fail.\n";
        $new_entry = $entry;
        $failed++;
    }

    # Carry over every field of the original entry that the hit lacks.
    my %new_fieldlist = map { $_ => 1 } $new_entry->fieldlist;
    foreach my $field ($entry->fieldlist) {
        next if (exists($new_fieldlist{$field}));    # && !($field =~ m/url/)
        $new_entry->set($field, $entry->get($field));
    }

    print {$out} $new_entry->print_s;
    print {$out} "\n\n";

    print "Done with $title, $authors.\n\n";
    $total++;
}

close($out) or die("Cannot close $output: $!\n");

print "Ran $total entries; $failed failed.\n";

# lookup_single($string_left, $string_right)
#
# Runs one search against the form at $url and returns the text of the first
# result's BibTeX record, or "" when nothing usable came back (a diagnostic
# line is printed in that case).
#
#   $string_left   first search string
#   $string_right  second search string; a value of exactly four digits is
#                  sent as the form's "year" field instead of being appended
#                  to the query.  May be undefined.
sub lookup_single {
    my ($string_left, $string_right) = @_;
    $string_left = "" unless (defined($string_left));
    $string_right = "" unless (defined($string_right));

    my $query;
    my $year = "";
    if ($string_right =~ m/^\d\d\d\d$/) {
        $year = $string_right;
        $query = $string_left;
    } else {
        $query = "$string_left $string_right";
    }

    # Drop BibTeX braces and a trailing period from the query.
    $query =~ s/\{|\}//mg;
    $query =~ s/\.$//mg;

    my $response = $agent->post($url,
        [ maxnum => 10,
          query => $query,
          results => "citation",
          sort => "score",
          year => $year ]
    );
    my $page = $response->content();

    if ($page !~ m/accesskey\=/) {
        print "No accesskey for $string_left, $string_right.\n";
        return "";
    }

    # Link to the BibTeX view of the result marked accesskey="1".
    my ($bibtex_url) = $page =~ m/accesskey\=\"1\".*?\"biblinks\".*?\"(\/cgi-bin\/bibshow.*?)\"/s;
    if (!defined($bibtex_url)) {
        print "Bibtex URL error on $string_left, $string_right.\n";
        return "";
    }

    $bibtex_url =~ s/\&amp\;/&/gi;    #stupid html escaping
    $response = $agent->get(uri_unescape("$base$bibtex_url"));

    # The record is the content of the first <pre> element of the page.
    # NOTE(review): the "<pre>" here and in the message below was missing
    # from the text this file was recovered from; restored to pair with the
    # closing tag that survived - confirm.
    my ($html) = $response->content() =~ m/.*?<pre>(.*?)<\/pre>/s;
    if (!defined($html)) {
        print "<pre> tag parse error on $string_left, $string_right.\n";
        return "";
    }

    my $text = $stripper->parse($html);
    $stripper->eof;    # reset the parser state before the next document
    return $text;
}


#sub value_to_string {
#  my $string = "";
#	my $value = shift;
#	my @all_values = $value->values;
#	my $simpleval;
#	foreach $simpleval (@all_values) {
#	  $string .= $simpleval->text . " ";
#	}
#}



################################### citeulike_pdf_grabber.pl
#!/alruccabah/local/usr/bin/perl

use warnings;
use strict;

use WWW::Mechanize;

# Setup for the CiteULike PDF grabber: reads the command line, builds the
# base URLs, creates the Mechanize browser, and prepares the print cache
# directory that all later relative file names refer to.
#
# Usage: citeulike_pdf_grabber.pl <password> [--print]
print "Starting up.\n";

my $username = "eweaver";
my $printer = "cis5";

# The first argument is the CiteULike password for $username.
my $password;
if (defined($ARGV[0])) {
  $password = $ARGV[0];
} else {
  die ("You need to supply a password, an optionally, the --print flag.\n");
}

# Without --print, downloads are saved with a "pdf~" extension; with it they
# are saved as "pdf".
my $extension = "pdf~";
my $print = 0;
if (defined($ARGV[1]) && $ARGV[1] eq "--print") {
  print "Printing enabled!\n";
  sleep(5);    # grace period to abort before anything gets printed
  $print = 1;
  $extension = "pdf";
}


# $home already ends in a slash, so none is added when building $base.
my $home = "http://www.citeulike.org/";
my $base = "${home}user/$username/";
my $mech = WWW::Mechanize->new(autocheck => 1);
my $print_cache = "/usa/$username/tmp/pdf_cache/";

# Create the cache BEFORE changing into it.  Doing it the other way round
# left the script in its start directory on the first run, so missing.html
# and every download ended up outside the cache.
print "Creating print cache...\n";
if (! -e $print_cache) {
  mkdir($print_cache) or die("Cannot create $print_cache: $!\n");
} else {
  print "Exists.\n";
}
chdir($print_cache) or die("Cannot change to $print_cache: $!\n");

# Start every run with an empty report of articles that could not be found.
my $missing = "missing.html";
open(my $missing_reset, '>', $missing) or die("Cannot write $missing: $!\n");
close($missing_reset);

#$mech->cookie_jar(HTTP::Cookies->new);

# Log in: open the user's page, follow the "Log in" link and submit the form
# named "frm" with the user name and password in its visible fields.
$mech->get($base);
$mech->follow_link( text => "Log in");

$mech->form_name("frm");
$mech->set_visible($username, $password);
$mech->submit();

print "We should be authenticated now.\n";

# Follow the link labelled with the user name.
$mech->follow_link( text => "$username");

print "At the base page.\n";

my @tag_links_all = $mech->links();
my @tag_links;

print "Searching for tags.\n";

# Only links that come after the one labelled "RSS" are considered; of
# those, the ones whose URL contains "/tag/" are collected.
my $found_rss = 0;
for my $candidate (@tag_links_all) {
  if (!$found_rss) {
    $found_rss = 1 if ($candidate->text() eq "RSS");
    next;
  }

  next unless ($candidate->url() =~ m/\/tag\//);

  push (@tag_links, $candidate);
  print "  Found tag " . $candidate->text() . ".\n";
}
#my @tag_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/tag\//);

print "Found " . scalar(@tag_links) . " tags.\n";

# Main loop: for every tag, visit every article filed under it and try to
# obtain a PDF - first the user's personal copy, otherwise by following
# external links for up to six hops.  Articles that cannot be retrieved are
# appended to $missing as HTML; with --print, freshly downloaded articles are
# converted to PostScript and sent to $printer.

my $miss = 0;

# Link texts tried, in this order, when a page has no direct .pdf link.
my @regexs = ( "View article online",
               "PDF",
               "Full text",
               "here" );

foreach my $tag_url (@tag_links) {
  my ($tag) = $tag_url->url() =~ m/.*\/(.+)$/;
  $tag = $tag_url->url() unless (defined($tag));

  print "  Fetching $tag.\n";

  # Every fetch starts from $base again.
  # NOTE(review): presumably to reset the browser's current page after the
  # external descent below - confirm.
  if (!_try_get($mech, $base) || !_try_get($mech, $tag_url)) {
    print "  Could not open tag page.\n\n";
    next;
  }

  my @cite_links = $mech->find_all_links( url_abs_regex => qr/\/\Q$username\E\/article\/\d+$/ );
  print "  Found " . scalar(@cite_links) . " citations.\n";

  foreach my $cite_url (@cite_links) {
    print "    Looking for article at: " . $cite_url->url() . "\n";

    # $mech is created with autocheck => 1, so a failed get() dies; the
    # request is wrapped so that one bad page does not abort the whole run.
    if (!_try_get($mech, $base) || !_try_get($mech, $cite_url)) {
      print "    Not found.\n\n";
      next;
    }

    my $title = $mech->title();
    if (!defined($title)) {
      print "    Not found.\n\n";
      next;
    }
    $title =~ s/CiteULike\: //;

    # File names are built from the title; "/" cannot appear in one.
    (my $file_stem = $title) =~ s{/}{_}g;

    # Number of intermediate pages reached while looking for the PDF.
    my $partial_hit_flag = 0;

    print "    Fetched article $title.\n";
    if (-e "$file_stem.pdf" || -e "$file_stem.$extension") {
      print "    Article exists in print cache.\n";
    } else {
      print "    Retrieving article.\n";
      my $link = $mech->find_link( url_abs_regex => qr/\/pdf\/user\/\Q$username\E\/.*\.pdf/i);
      if ($link) {
        if (_try_get($mech, $link)) {
          print "      Found a personal .pdf at " . $link->url() . ".\n";
          $mech->save_content("$file_stem.$extension");
          print "      Saved it.\n";
        }
      } else {
        print "      Trying to follow an external link.\n";

        for (my $i = 0; $i < 6; $i++) {
          $link = $mech->find_link( url_regex => qr/\.pdf\s*$/ );
          if ($link) {
            print "        I found a direct link.\n";
            if (_try_get($mech, $link)) {
              $mech->save_content("$file_stem.$extension");
              print "        Saved it.\n";
            }
            last;
          }

          print "        Descending URL tree ($i).\n";

          my $moved = 0;
          foreach my $pattern (@regexs) {
            $link = $mech->find_link( text_regex => qr/$pattern/i);
            next unless ($link);

            print "        Found a \"" . $pattern . "\" link.\n";
            if (_try_get($mech, $link)) {
              $partial_hit_flag++;
              $moved = 1;
            }
            last;
          }

          # Still on the same page: another pass would find the same links.
          last unless ($moved);
        }
      }

      if (!-e "$file_stem.pdf" && !-e "$file_stem.$extension") {
        print  "  Couldn't find any match for:\n    $title\n    " . $cite_url->url() . "    \n";
        $miss++;

        # NOTE(review): the HTML in these statements was stripped from the
        # text this file was recovered from, leaving them unparsable.  The
        # link to the article is certain from what survived; the surrounding
        # <p>/<br> tags are a reconstruction - confirm.
        open(my $missing_fh, '>>', $missing) or die("Cannot append to $missing: $!\n");
        print {$missing_fh} "<p>\n";
        if ($partial_hit_flag > 0) {
          print {$missing_fh} "Please check following (partial hits $partial_hit_flag):<br>\n";
        }
        print {$missing_fh} "<a href=\"" . _html_escape($cite_url->url()) . "\">"
                          . _html_escape($title) . "</a></p>\n";
        close($missing_fh) or die("Cannot close $missing: $!\n");
      } elsif ($print) {
        print " Printing $title.\n";
        unlink("tmp.ps") if (-e "tmp.ps");
        if (-e "$file_stem.pdf~") {
          print " Removed non-printed cache file.\n";
          unlink("$file_stem.pdf~");
        }

        # List-form system(): the title comes from a web page and must not
        # pass through a shell.
        if (system("acroread", "-toPostScript", "-start", "1", "-end", "40",
                   "-pairs", "$file_stem.pdf", "tmp.ps") == 0) {
          print " Converted to .ps.\n";
          sleep(3);
          system("lpr", "-P$printer", "tmp.ps");
        } else {
          print " Conversion to .ps failed; not printing.\n";
        }
      }
    }

    print "\n";
  }
}

print "Done. There were $miss articles I couldn't find.\n";

# _try_get($mech, $target)
# Fetches $target (a URL or a link object) with $mech.  Returns 1 on success
# and 0 when the request died, after reporting the error on STDERR.
sub _try_get {
  my ($browser, $target) = @_;
  my $ok = eval { $browser->get($target); 1 };
  if (!$ok) {
    warn("    Request failed: $@");
    return 0;
  }
  return 1;
}

# _html_escape($text)
# Returns $text with the characters & < > " replaced by HTML entities;
# an undefined value yields "".
sub _html_escape {
  my ($text) = @_;
  $text = "" unless (defined($text));
  $text =~ s/&/&amp;/g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/"/&quot;/g;
  return $text;
}