################################### bib_canonicalizer.pl
#!/alruccabah/local/usr/bin/perl

use strict;
use warnings;
#use diagnostics;

# CiteULike account name.  An input argument that begins with this name is
# treated as a CiteULike user/tag path instead of a local .bib file.
my $username = "whatever";

# Host and search endpoint used by lookup_single().
my $base = "http://liinwww.ira.uka.de";
my $url = "$base/csbib";

# Prefix to which the user/tag path is appended to fetch a BibTeX export.
my $citeulike = "http://www.citeulike.org/bibtex/user/";

# Command line: <input> [<output>]
my ($input, $output) = @ARGV[0, 1];

if (!$input) {
    die("Specify your citeulike username/tag path or an input .bib file.\n");
}

use Text::BibTeX qw(:macrosubs);
use LWP;
use HTML::Strip;
use URI::Escape;
use String::Compare;

# Shared HTTP client and HTML-to-text converter, both used by lookup_single().
my $agent = LWP::UserAgent->new;
my $stripper = HTML::Strip->new;

# Three-letter month macros and the full month names they expand to.
my %month = (
    jan => "January",
    feb => "February",
    mar => "March",
    apr => "April",
    may => "May",
    jun => "June",
    jul => "July",
    aug => "August",
    sep => "September",
    oct => "October",
    nov => "November",
    dec => "December",
);

# Register every macro with Text::BibTeX.
my ($macro, $value);
while (($macro, $value) = each %month) {
    add_macro_text($macro, $value);
}

#my $Input = new Text::BibTeX::File($input);

# Load the raw BibTeX text and split it into one chunk per entry.
#
# Two modes, selected by the first command-line argument:
#   * it begins with $username: the text is downloaded from CiteULike, a
#     copy is kept in tmp.bib, and a default output name is derived from the
#     last path component when none was given;
#   * otherwise it is read as a local file, and an output name is mandatory.
#
# On exit @Input holds the entries WITHOUT their leading "@".
my @Input;
my $bibtex_text;

# \Q..\E: the user name is matched literally, not as a pattern.
if ($input =~ m/^\Q$username\E/) {
    print "Trying to retrieve from $citeulike$input.\n";

    # LWP::UserAgent::get() interprets extra arguments as request header
    # fields, so the export options have to be sent in the query string.
    my $response = $agent->get(
        $citeulike . $input . "?do_username_prefix=0&key_type=0");
    die("Could not retrieve $citeulike$input: "
        . $response->status_line . "\n")
        unless ($response->is_success);

    $bibtex_text = $response->content;

    # Keep a copy of exactly what was downloaded.
    open(my $tmp_fh, ">", "tmp.bib") or die("Cannot write tmp.bib: $!\n");
    print {$tmp_fh} $bibtex_text;
    close($tmp_fh) or die("Cannot finish writing tmp.bib: $!\n");

    if (!$output) {
        # Last non-empty path component; the whole input when there is none.
        my ($leaf) = $input =~ m{([^/]+)/*$};
        $output = (defined($leaf) ? $leaf : $input) . ".bib";
    }
} else {
    die("Specify an output .bib file when reading a local input file.\n")
        unless ($output);

    open(my $input_fh, "<", $input) or die("Cannot read $input: $!\n");
    local $/ = undef;    # slurp the whole file in one read
    $bibtex_text = <$input_fh>;
    close($input_fh);
}

$bibtex_text = "" unless defined($bibtex_text);
@Input = split(/\n\@/, $bibtex_text);

# The first chunk is whatever precedes the first "\n@".  When the text starts
# directly with an entry that chunk IS the first entry (still carrying its
# "@"), so keep it; otherwise it is leading junk and is discarded.
if (@Input) {
    if ($Input[0] =~ m/^\s*\@/) {
        $Input[0] =~ s/^\s*\@//;
    } else {
        shift(@Input);
    }
}


# Main pass: for every input entry look up a matching entry through
# lookup_single(), merge in the fields only the original has, and append the
# result to $output.  Entries for which no sufficiently similar title is
# found are written out unchanged and counted as failures.

# Start from an empty output file.  Entries are appended one at a time below,
# so everything written so far is on disk while the slow lookups run.
open(my $truncate_fh, ">", $output) or die("Cannot write $output: $!\n");
close($truncate_fh);

my ($failed, $total) = (0, 0);

foreach my $entry_text (@Input) {
    # The split on "\n@" removed the leading "@"; put it back before parsing.
    my $entry = Text::BibTeX::Entry->new("@" . $entry_text);
    next unless $entry->parse_ok;

    # Space-separated last names of all authors, used as search terms.
    my $authors = "";
    if ($entry->exists("author")) {
        foreach my $name ($entry->names("author")) {
            $authors .= " " . join(" ", $name->part("last"));
        }
    }

    my ($title) = $entry->get("title");
    $title = "" unless defined($title);
    my ($year) = $entry->get("year");

    # Argument pairs for lookup_single(), tried in this order until one
    # returns an entry whose title is similar enough to the original.
    my @searches = (
        [$title,   $authors],
        [$title,   $year],
        [$authors, $year],
    );

    my $new_entry;
    my $matched = 0;
    my $pass = 0;

    foreach my $search (@searches) {
        my $found_text = lookup_single(@$search);

        $pass++;
        print "  Pass $pass.\n";

        next unless ($found_text);

        my $candidate = Text::BibTeX::Entry->new($found_text);
        next unless ($candidate->parse_ok);

        # delete() takes field names, not an array reference.
        $candidate->delete("crossref");

        my ($new_title) = $candidate->get("title");
        if (length($title) && defined($new_title)
            && compare($title, $new_title) > .5) {
            $new_entry = $candidate;
            $matched = 1;
            last;
        }
    }

    if (!$matched) {
        print "  Match fail.\n";
        $new_entry = $entry;
        $failed++;
    }

    # Copy across every field the original entry has and the new one lacks;
    # fields present in both keep the looked-up value.
    my %has_field = map { $_ => 1 } $new_entry->fieldlist;
    foreach my $field ($entry->fieldlist) {
        next if ($has_field{$field});
        $new_entry->set($field, $entry->get($field));
    }

    open(my $out_fh, ">>", $output)
        or die("Cannot append to $output: $!\n");
    print {$out_fh} $new_entry->print_s;
    print {$out_fh} "\n\n";
    close($out_fh) or die("Cannot finish writing $output: $!\n");

    print "Done with $title, $authors.\n\n";

    $total++;
}

print "Ran $total entries; $failed failed.\n";

# lookup_single($string_left, $string_right)
#
# Search the bibliography service at $url and return the BibTeX text of the
# first hit.
#
# Parameters:
#   $string_left  - first search term (undef is treated as "").
#   $string_right - second search term (undef is treated as "").  When it is
#                   exactly four digits it is sent as the "year" form field
#                   and only $string_left is used as the query; otherwise
#                   both strings are joined into the query.
#
# Returns the text inside the first <pre class="bibtex"> element of the
# hit's page with HTML markup stripped, or "" when the request fails or the
# expected markup is not found (a diagnostic is printed in those cases).
sub lookup_single {
    my ($string_left, $string_right) = @_;

    $string_left  = "" unless defined($string_left);
    $string_right = "" unless defined($string_right);

    my $query;
    my $year = "";
    if ($string_right =~ m/^\d\d\d\d$/) {
        $year  = $string_right;
        $query = $string_left;
    } else {
        $query = "$string_left $string_right";
    }

    # Drop BibTeX braces and a full stop at the end of any line.
    $query =~ s/\{|\}//mg;
    $query =~ s/\.$//mg;

    my $response = $agent->post($url,
        [ maxnum  => 10,
          query   => $query,
          results => "citation",
          sort    => "score",
          year    => $year,
        ]
    );

    if (!$response->is_success) {
        print "Search request failed for $string_left, $string_right: "
            . $response->status_line . "\n";
        return "";
    }

    my $results_page = $response->content();

    if ($results_page !~ m/accesskey\=/) {
        print "No accesskey for $string_left, $string_right.\n";
        return "";
    }

    # Link to the BibTeX view of the first result (accesskey="1").
    my ($bibtex_url) = $results_page
        =~ m/accesskey\=\"1\".*?\"biblinks\".*?\"(\/cgi-bin\/bibshow.*?)\"/s;

    if (!defined($bibtex_url)) {
        print "Bibtex URL error on $string_left, $string_right.\n";
        return "";
    }

    # The href is HTML-escaped inside the page source.
    $bibtex_url =~ s/\&amp\;/&/gmi;

    $response = $agent->get(uri_unescape("$base$bibtex_url"));
    if (!$response->is_success) {
        print "BibTeX request failed for $string_left, $string_right: "
            . $response->status_line . "\n";
        return "";
    }

    my ($html) = $response->content()
        =~ m/.*?<pre class=\"bibtex\">(.*?)<\/pre>/s;

    if (!defined($html)) {
        print "<pre> tag parse error on $string_left, $string_right.\n";
        return "";
    }

    # $stripper is shared between calls; eof() resets its parser state so
    # that this document cannot influence how the next one is stripped.
    my $bibtex = $stripper->parse($html);
    $stripper->eof;

    return $bibtex;
}


#sub value_to_string {
#  my $string = "";
#	my $value = shift;
#	my @all_values = $value->values;
#	my $simpleval;
#	foreach $simpleval (@all_values) {
#	  $string .= $simpleval->text . " ";
#	}
#}



################################### citeulike_pdf_grabber.pl
#!/alruccabah/local/usr/bin/perl

use warnings;
use strict;

use WWW::Mechanize;

# Start-up: read the command line, build the URLs and the browser object,
# and make sure the print cache directory exists and is the working
# directory before anything is written into it.
print "Starting up.\n";

my $username = "eweaver";
my $printer = "cis5";

# NOTE(review): the password is taken from the command line, where other
# local users may be able to see it in the process list.
my $password;
if (defined($ARGV[0])) {
    $password = $ARGV[0];
} else {
    die ("You need to supply a password, and optionally, the --print flag.\n");
}

# Downloads are saved as "<title>.pdf~" by default; with --print they are
# saved as "<title>.pdf" and sent to the printer.
my $extension = "pdf~";
my $print = 0;
if (defined($ARGV[1]) && $ARGV[1] eq "--print") {
    print "Printing enabled!\n";
    sleep(5);    # pause before continuing with printing switched on
    $print = 1;
    $extension = "pdf";
}


# $home already ends in "/", so it is joined without another separator.
my $home = "http://www.citeulike.org/";
my $base = "${home}user/$username/";
my $mech = WWW::Mechanize->new(autocheck => 1);
my $print_cache = "/usa/$username/tmp/pdf_cache/";

# Create the cache directory first: both chdir() and the files written
# below depend on it.
print "Creating print cache...\n";
if (! -e $print_cache) {
    mkdir($print_cache) or die("Cannot create $print_cache: $!\n");
} else {
    print "Exists.\n";
}
chdir($print_cache) or die("Cannot change to $print_cache: $!\n");

# Start each run with an empty report of articles that could not be fetched.
my $missing = "missing.html";
open(my $missing_reset_fh, ">", $missing)
    or die("Cannot write $missing: $!\n");
close($missing_reset_fh);

#$mech->cookie_jar(HTTP::Cookies->new);

# Log in through the site's login form, go to the user's page and collect
# the tag links found there into @tag_links.
$mech->get($base);
$mech->follow_link( text => "Log in");

$mech->form_name("frm");
$mech->set_visible($username, $password);
$mech->submit();

print "We should be authenticated now.\n";

$mech->follow_link( text => $username);

print "At the base page.\n";

my @tag_links_all = $mech->links();
my @tag_links;

print "Searching for tags.\n";

# Only links that come after the link labelled "RSS" are considered; of
# those, the ones whose URL contains "/tag/" are kept.
my $found_rss = 0;
foreach my $link (@tag_links_all) {
    # NOTE(review): text() appears to be undefined for links without anchor
    # text; default it so the comparison below does not warn.
    my $text = $link->text();
    $text = "" unless defined($text);

    if ($found_rss) {
        if ($link->url() =~ m/\/tag\//) {
            push (@tag_links, $link);
            print "  Found tag " . $text . ".\n";
        }
    } elsif ($text eq "RSS") {
        $found_rss = 1;
    }
}
#my @tag_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/tag\//);

print "Found " . scalar(@tag_links) . " tags.\n";

# Walk every tag page, visit each article listed on it and try to obtain a
# PDF for the article: first a personal copy attached to the account, then
# a direct ".pdf" link, then by following up to six "view online"-style
# links.  Articles for which nothing could be saved are recorded in
# $missing; with --print, newly saved PDFs are converted and printed.

# Fetch $target (a URL or link object) with $mech and return 1 on success,
# 0 on failure.  $mech is created with autocheck enabled, so a failed fetch
# raises an exception; it is trapped here so that one dead link does not
# abort the whole run.
sub _try_get {
    my ($target) = @_;
    my $ok = eval { $mech->get($target); 1 };
    return ($ok && $mech->success()) ? 1 : 0;
}

# Link texts tried, in this order, when a page has no direct .pdf link.
my @link_labels = ( "View article online",
                    "PDF",
                    "Full text",
                    "here" );

my $miss = 0;

foreach my $tag_url (@tag_links) {
    my ($tag) = $tag_url->url() =~ m/.*\/(.+)$/;
    $tag = $tag_url->url() unless defined($tag);

    print "  Fetching $tag.\n";

    $mech->get($base);
    $mech->get($tag_url);

    my @cite_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/article\/\d+$/ );
    print "  Found " . scalar(@cite_links) . " citations.\n";

    foreach my $cite_url (@cite_links) {
        print "    Looking for article at: " . $cite_url->url() . "\n";
        $mech->get($base);
        if (!_try_get($cite_url)) {
            print "    Not found.\n\n";
            next;
        }

        my $title = $mech->title();
        $title = "" unless defined($title);
        $title =~ s/CiteULike\: //;

        # The title doubles as the cache file name; a "/" (or NUL) in it
        # would otherwise be taken as part of a path.
        (my $file_stem = $title) =~ s{[/\0]}{_}g;

        # Number of intermediate links followed without reaching a PDF.
        my $partial_hit_flag = 0;

        print "    Fetched article $title.\n";
        if (-e "$file_stem.pdf" || -e "$file_stem.$extension") {
            print "    Article exists in print cache.\n";
        } else {
            print "    Retrieving article.\n";
            my $link = $mech->find_link( url_abs_regex => qr/\/pdf\/user\/$username\/.*\.pdf/i);
            if ($link) {
                if (_try_get($link)) {
                    print "      Found a personal .pdf at " . $link->url() . ".\n";
                    $mech->save_content("$file_stem.$extension");
                    print "      Saved it.\n";
                } else {
                    print "      Could not download the personal .pdf at " . $link->url() . ".\n";
                }
            } else {
                print "      Trying to follow an external link.\n";

                DESCENT:
                for my $depth (0 .. 5) {
                    $link = $mech->find_link( url_regex => qr/\.pdf\s*$/ );
                    if ($link) {
                        print "        I found a direct link.\n";
                        if (_try_get($link)) {
                            $mech->save_content("$file_stem.$extension");
                            print "        Saved it.\n";
                        } else {
                            print "        Could not download it.\n";
                        }
                        last DESCENT;
                    }

                    print "        Descending URL tree ($depth).\n";

                    my $followed = 0;
                    foreach my $label (@link_labels) {
                        $link = $mech->find_link( text_regex => qr/$label/i);
                        next unless ($link);

                        print "        Found a \"" . $label . "\" link.\n";
                        if (_try_get($link)) {
                            $partial_hit_flag++;
                            $followed = 1;
                        } else {
                            print "        Could not follow it.\n";
                        }
                        last;
                    }

                    # Nothing new was loaded, so another round would look at
                    # the same page and find the same (lack of) links.
                    last DESCENT unless ($followed);
                }
            }

            if (!-e "$file_stem.pdf" && !-e "$file_stem.$extension") {
                print  "  Couldn't find any match for:\n    $title\n    " . $cite_url->url() . "    \n";
                $miss++;
                open(my $missing_fh, ">>", $missing)
                    or die("Cannot append to $missing: $!\n");
                print {$missing_fh} "<p>";
                if ($partial_hit_flag > 0) {
                    print {$missing_fh} "<b>Please check following (partial hits $partial_hit_flag):</b><br>\n";
                }
                # url_abs() gives the absolute address whatever form the
                # href had in the page.
                print {$missing_fh} "<a href=\"" . $cite_url->url_abs() . "\">$title</a></p>\n";
                close($missing_fh);
            } elsif ($print) {
                print "  Printing $title.\n";
                if (-e "tmp.ps") {
                    unlink("tmp.ps");
                }
                if (-e "$file_stem.pdf~") {
                    print "  Removed non-printed cache file.\n";
                    unlink("$file_stem.pdf~");
                }
                # List-form system(): the file name comes from a web page
                # title and must not be interpreted by a shell.
                system("acroread", "-toPostScript", "-start", "1",
                       "-end", "40", "-pairs", "$file_stem.pdf", "tmp.ps");
                print "  Converted to .ps.\n";
                sleep(3);
                system("lpr", "-P$printer", "tmp.ps");
            }
        }
        print "\n";
    }
}

print "Done. There were $miss articles I couldn't find.\n";