#!/usr/bin/perl
# k5download.pl
# This has the same license as Perl.

use strict;
use warnings;

# You probably need to install these modules.
# See the cpan command or your local variant like apt-get
# for Debian based systems and PPM for Win. All these should
# be in Debian Testing/Ubuntu.
use HTML::TokeParser;           # In HTML::Parser.
use WWW::Mechanize;
use Getopt::Long;
use Date::Manip;
use YAML;

# ------------------------------------------------
our $VERSION = '0.01';

use constant TIME_BETWEEN_PAGELOADS => 4;
use constant SCOOP_WEB_SITE         => "http://www.kuro5hin.org";

# ------------------------------------------------
# Help text:
my $usage = "
This program is used to screen scrape 'kuro5hin.org' for your
comments. You own your comments on K5 but not other people's
comments, so this just supports getting what you have written
yourself.

It downloads the full html text, so do NOT use 'nested' for 'Comment
display mode'! That hurts usability when grep-ing (searching) your
saved comments, anyway.

Use: perl k5download.pl OPTIONS ...

Options are:
 -u username   Specify the username at kuro5hin.
 -p password   Password so the program can log in.
 --dump dir    Save all your comments in dir.
 -n no         Just get the 'no' last comments.
 --log         Log what's done on STDERR. (Simple debug.)
 --evil        Usually, the program waits a bit between every read
               from K5. Not with this flag.
               (-: If you use it, I'll laugh if rusty cancels your
               account. :-)
 --help        Shows this text.

'-p' and '--dump' are mandatory. If no user name is specified, the
login name will be used (-: that should work for rusty, at least :-).

Examples of use:

 Dump html of ALL posts to K5 into dir 'foo':
   perl k5download.pl -u bar -p XX --dump 'foo' --log

 Get html of the last 200 posts to K5 into a dir:
   perl k5download.pl -u bar -p XX --dump foo --log -n 200

To Do:
 - Utility to parse out the html to text files, so they are easier to
   grep (search, for win people). 3vial.
 - Utility that downloads new things you've written (and new comments
   and mods on old stuff) and updates the archive without downloading
   everything again!
 - Utility that downloads modding and finds who modbombs you
   (Hello 'V' and 'jxg'!)
 - Utility that parses the data to find sniper comments added much
   later, to get the last word (Hello, 'A Bore'!)
 - Remove the warning when downloading the comment list. :-)

Notes:
 - The generated yaml file contains all the info an updater program
   would need -- and it is both human readable and not Perl specific.
   Have fun!
 - Not tested on Win; sorry, I can only run Unix stuff at home.
 - Don't use the same dir for multiple downloads. An information file
   will be overwritten.
 - This will certainly need modification to work with other Scoop
   sites.
 - I should have written an extension to Scoop and published that
   instead, which rusty could have installed. More efficient.
";

# ------------------------------------------------
# Parse parameters:
my($username, $password, $dir, $max_comments, $log, $evil, $help);

my($rslt) = GetOptions('u|user:s'  => \$username,
                       'p|pwd=s'   => \$password,
                       'h|help'    => \$help,
                       'evil'      => \$evil,
                       'n|maxno:i' => \$max_comments,
                       'dump:s'    => \$dir,
                       'log'       => \$log,
                      );

die "Bad parameters?!$usage" unless $rslt;

if ($help) {
    print $usage;
    exit 0;
}

die "Password is mandatory. Try '--help'.\n" unless $password;
chomp($username = `whoami`) unless $username;   # `whoami` ends with a newline.

die "'--dump' is mandatory. Try '--help'.\n" unless $dir;

# Try to create the dir before the slow && painful d/l:
chop $dir if $dir =~ m!/$!;
mkdir $dir unless -d $dir;
die "'--dump' can't make dir '$dir'" unless -d $dir;

# ------------------------------------------------
# Init and try to login at the site:
my $mech = WWW::Mechanize->new();
$mech->stack_depth(2);          # Don't use so much RAM.

$mech->get(SCOOP_WEB_SITE . "?uname=$username&pass=$password");
die "Failed to login with user $username"
    if $mech->content() !~ /Logout from all locations/
    && $mech->content() !~ /Logout $username/;

print STDERR "Logged in\n" if $log;
sleep TIME_BETWEEN_PAGELOADS unless $evil;

# ------------------------------------------------
# Read down the comment list:
my($cmts) = [];                 # Store comment info here.
load_from_comment_list($mech, $username, $cmts, $max_comments, $log, $evil);

# Note for you guys with multiple accounts: you could just add
# more calls here.

# ------------------------------------------------
# Get all comments.
# (Already checked that the dir $dir exists.)
# (Start with the earliest days, so the numbering of comments within a
# given day stays the same even if more comments are added later the
# same day.)

foreach my $c (reverse @$cmts) {
    $mech->get(SCOOP_WEB_SITE . $c->{url});

    # - - - Create file name:
    my($date)  = ParseDate($c->{date});
    my($fname) = UnixDate($date, "%y-%m-%d");
    my($no)    = 1;
    while (-f "$dir/$fname-$no.html") {
        $no++;
    }
    $c->{fname} = "$fname-$no.html";
    $fname      = "$dir/$fname-$no.html";

    # - - - Log, etc:
    print STDERR "Got $fname " . $c->{url} . "\n" if $log;
    sleep TIME_BETWEEN_PAGELOADS unless $evil;

    # - - - Prepare data for storing:
    my($html) = $mech->content();

    # Add an html comment with easily parsed known info about the comment:
    my($text) = sprintf "mod %s,%s. repl %s date %s, url '%s'" .
                        " story %s title '%s'.",
        $c->{mods}, $c->{mod}, $c->{answers}, $c->{date},
        $c->{url}, $c->{storyurl}, $c->{title};
    $text =~ s/-->/--&gt;/g;    # Don't let the text close the html comment early.

    # Insert the info as an html comment just before </head>.
    # (Assumes every comment page has a </head> tag.)
    die "Couldn't find '</head>' in " . $c->{url}
        unless $html =~ s@</head>@<!-- $text -->\n</head>@i;

    # - - - Store the comment to file:
    if (open(my $fh, '>', $fname)) {
        print $fh $html;
        close $fh;
    }
    else {
        warn "Couldn't save to file '$fname'";
    }
}

# ------------------------------------------------
# Dump YAML data description (important).
# This yaml file makes an update script trivial to write.
YAML::DumpFile("$dir/comments.yaml", $cmts);
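# A rough sketch of how a separate updater script could start (not part
# of this program; it assumes nothing beyond YAML::LoadFile and the
# 'url' key stored for each comment above):
#
#   use YAML;
#   my $old  = YAML::LoadFile("$dir/comments.yaml");
#   my %seen = map { $_->{url} => 1 } @$old;
#   # ...re-read the comment list, skip every url already in %seen,
#   # download only the new comments, push them onto @$old and dump
#   # the updated list back to comments.yaml.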
# ------------------------------------------------
# Dump the comment list as simple html.
# Make a utility html list with references to all comments.
print STDERR "Save list to $dir/comment-list.html\n" if $log;

if (open(my $fh, '>', "$dir/comment-list.html")) {
    print $fh "<html><head><title>Comments by $username at kuro5hin.org</title></head>\n";
    print $fh "<body>\n<ul>\n";

    foreach my $c (@$cmts) {
        print $fh '<li>';
        print $fh '<a href="' . SCOOP_WEB_SITE . $c->{url} . '">',
            $c->{title}, '</a> on ', $c->{date};
        print $fh "<br>\n";

        my($mod) = $c->{mod} ? $c->{mod} : '';
        printf $fh 'Rated by <a href="%s">%s</a> for %s, %s replies.',
            SCOOP_WEB_SITE . $c->{modurl}, $c->{mods},
            $mod, $c->{answers};
        printf $fh ' Story <a href="%s">%s</a>',
            SCOOP_WEB_SITE . $c->{storyurl}, $c->{storyname};
        print $fh "</li>\n";