#!/usr/bin/perl
# k5download.pl
# This has the same license as Perl.
use strict;
use warnings;
# You probably need to install these modules first.
# See the cpan command, or your local variant like apt-get
# for Debian-based systems and PPM for Windows. All of these
# should be in Debian Testing/Ubuntu.
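# A minimal install sketch (the Debian package names are my guess;
# adjust for your system):
#   cpan HTML::Parser WWW::Mechanize Date::Manip YAML
#   apt-get install libwww-mechanize-perl libdate-manip-perl libyaml-perl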
use HTML::TokeParser; # In HTML::Parser.
use WWW::Mechanize;
use Getopt::Long;
use Date::Manip;
use YAML;
# ------------------------------------------------
our $VERSION = '0.01';
use constant TIME_BETWEEN_PAGELOADS => 4;
use constant SCOOP_WEB_SITE => "http://www.kuro5hin.org";
# ------------------------------------------------
# Help text:
my $usage = "
This program is used to screen scrape 'kuro5hin.org' for your comments.
You own your comments on K5 but not other people's comments, so this
just supports getting what you have written yourself. It d/l the full
html text, so do NOT use 'nested' for 'Comment display mode'! It hurts
usability for grep-ping (searching) your saved comments, anyway.
Use: perl k5download.pl OPTIONS ...
Options are:
 -u username  Specify the username at kuro5hin.
 -p password  Password so the program can log in.
 --dump dir   Save all your comments in dir.
 -n no        Only get the last 'no' comments.
 --log        Log what's done on STDERR. (Simple debug.)
 --evil       Usually, the program waits a bit between every read
              from K5. Not with this flag. (-: If you use it, I'll
              laugh if rusty cancels your account. :-)
 --help       Shows this text.
'-p' and '--dump' are mandatory.
If no user name is specified, the login name will be used (-: that
should work for rusty, at least :-).
Examples of use:
Dumps html of ALL posts to K5 in dir 'foo'.
perl k5download.pl -u bar -p XX --dump 'foo' --log
Gets html of the last 200 posts to K5 in a dir.
perl k5download.pl -u bar -p XX --dump foo --log -n 200
To Do:
 - Utility to parse out the html to text files, so they are easier to
   grep (search, for win people). 3vial.
 - Utility that d/l new things you've written (and when you get
   comments and mods on old stuff) and updates without d/l everything!
 - Utility that d/l modding and finds who modbombs you (Hello 'V' and
   'jxg'!)
 - Utility that parses data to find sniper comments added much later,
   to get the last word (Hello, 'A Bore'!)
Notes:
 - The generated yaml file contains all the info an updater program
   would need -- and it is both human readable and not Perl specific.
   Have fun!
 - Not tested on Win; sorry, I can only run Unix stuff at home.
 - Don't use the same dir for multiple downloads. An information file
   will be overwritten.
 - This will certainly need modification to work with other Scoop
   sites.
- I should have written an extension to Scoop and published instead,
which rusty could have installed. More efficient.
";
# ------------------------------------------------
# Parse parameters:
my($username, $password, $dir,
$max_comments, $log, $evil, $help);
my($rslt) = GetOptions('u|user:s' => \$username,
'p|pwd=s' => \$password,
'h|help' => \$help,
evil => \$evil,
'n|maxno:i' => \$max_comments,
'dump:s' => \$dir,
log => \$log,
);
die "Bad parameters?!$usage" unless $rslt;
if ($help) {
print $usage;
exit 0;
}
die "Password is mandatory. Try '--help'" unless $password;
chomp($username = `whoami`) unless $username; # whoami adds a newline.
die "'--dump' dir is mandatory. Try '--help'" unless $dir;
# Try to create dir before slow && painful d/l:
chop $dir if $dir =~ m!/$!;
mkdir $dir unless -d $dir;
die "'--dump' can't make dir '$dir'" unless -d $dir;
# ------------------------------------------------
# Init and try to login at the site:
my $mech = WWW::Mechanize->new();
$mech->stack_depth(2); # Don't use so much RAM
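# Scoop takes login credentials as plain query parameters on a GET,
# so a single request both logs us in and fetches the front page: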
$mech->get(SCOOP_WEB_SITE . "?uname=$username&pass=$password");
die "Failed to login with user $username"
  if $mech->content() !~ /Logout from all locations/ &&
     $mech->content() !~ /Logout \Q$username\E/;
print STDERR "Logged in\n" if $log;
sleep TIME_BETWEEN_PAGELOADS unless $evil;
# ------------------------------------------------
# Read down Comment list:
my($cmts) = []; # Store comment info here
load_from_comment_list($mech, $username, $cmts,
$max_comments, $log, $evil);
# Note for you guys with multiple accounts. You could just add
# calls here.
# ------------------------------------------------
# Get all comments.
# (We already checked that dir $dir exists.)
# (Start with the earliest days, so the numbering of comments on a
# given day stays the same even if more comments are added that day.)
foreach my $c (reverse @$cmts) {
$mech->get(SCOOP_WEB_SITE . $c->{url});
# - - - Create file name:
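    # Files are named YY-MM-DD-N.html, where N counts up from 1 so
    # several comments from the same day never clobber each other.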
my($date) = ParseDate($c->{date});
my($fname) = UnixDate($date, "%y-%m-%d");
my($no) = 1;
while (-f "$dir/$fname-$no.html") {
$no++;
}
$c->{fname} = "$fname-$no.html";
$fname = "$dir/$fname-$no.html";
# - - - Log, etc:
print STDERR "Got $fname " . $c->{url} ."\n" if $log;
sleep TIME_BETWEEN_PAGELOADS unless $evil;
# - - - Prepare data for storing:
my($html) = $mech->content();
# Add html comment with easily parsed known info about comment:
my($text) = sprintf "mod %s,%s. repl %s date %s, url '%s'"
. " story %s title '%s'.",
$c->{mods}, $c->{mod}, $c->{answers},
$c->{date}, $c->{url},
$c->{storyurl}, $c->{title};
    # Escape '-->' so comment text can't close the HTML comment early:
    $text =~ s/-->/--&gt;/g;
die "Couldn't find '</head>' in " . $c->{url}
unless $html =~ s@</HEAD@<!-- $text --> </HEAD@i;
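    # The metadata lands just before </head> as an HTML comment, e.g.
    # (made-up values):
    #   <!-- mod 3.50,4. repl 2 date ... url '/comments/...'
    #        story /story/... title '...'. -->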
# - - - Store:
if (open(my $fh, '>', $fname)) {
print $fh $html;
close $fh;
} else {
warn "Couldn't save to file '$fname'";
}
}
# ------------------------------------------------
# Dump YAML Data description (important)
# This yaml file makes an update script trivial to write.
YAML::DumpFile("$dir/comments.yaml", $cmts);
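# An update script could read it back with something like (sketch):
#   my $cmts = YAML::LoadFile("$dir/comments.yaml");
#   printf "%d comments on record\n", scalar(@$cmts);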
# ------------------------------------------------
# Dump comment list as simple html.
# Make utility html list with references to all comments.
print STDERR "Save list to $dir/comment-list.html\n" if $log;
if (open(my $fh, '>', "$dir/comment-list.html")) {
print $fh "
<html><head><title>Comments By $username</title></head><body>
";
foreach my $c (@$cmts) {
print $fh '<p>';
print $fh '<a href="', SCOOP_WEB_SITE, $c->{url}, '">',
$c->{title}, '</a> on ', $c->{date};
print $fh "<br />\n";
my($mod) = $c->{mod} ? $c->{mod} : '';
printf $fh 'Rated by <a href="%s">%s</a> for %s, %s replies.',
SCOOP_WEB_SITE . $c->{modurl}, $c->{mods},
$mod, $c->{answers};
printf $fh ' Story <a href="%s">%s</a>',
SCOOP_WEB_SITE . $c->{storyurl}, $c->{storyname};
print $fh "</p>\n";
}
print $fh "\n</body></html>\n";
close $fh;
} else {
warn "ERR! Couldn't save to '$dir/comment-list.html'!?\n";
}
# ------------------------------------------------
# Get Comment list from Scoop site:
sub load_from_comment_list {
my($mech, $username, $cmts, $max_comments, $log, $evil) = @_;
my($cmts_url) = SCOOP_WEB_SITE . "/user/$username/comments";
    # Suitable test: a user with quite few posts:
    #$cmts_url = SCOOP_WEB_SITE . "/user/Zombie%20Abu%20Musab%20al%20Zarqawi/comments";
# - - - Read first page of comment list:
    $mech->get($cmts_url);
    die "Failed to get comment list from $cmts_url"
        unless $mech->success();
print STDERR "Got 1st comment list page\n" if $log;
sleep TIME_BETWEEN_PAGELOADS unless $evil;
# - - - Loop over comment list pages and get comment refs:
while (1) {
my($no_cmts)= scalar(@$cmts);
        last if $max_comments && $max_comments <= $no_cmts;
my($content)= $mech->content();
        parse_cmtlist(\$content, $cmts);
my($c_now) = scalar(@$cmts);
print STDERR "Has read $c_now cmts\n" if $log;
# - - - Go to next page, if any for comment list:
        # (form_with_fields() is quite new in the WWW::Mechanize API,
        # so don't rely on it:)
        #my($form) = $mech->form_with_fields('next');
        # Instead, scan the first few forms for one with a 'next'
        # button. Forms are numbered from 1 in WWW::Mechanize.
        my($form);
        for my $i (1 .. 4) {
            $form = $mech->form_number($i);
            last unless $form;
            last if $form->find_input('next');
            $form = undef; # Not the one we want; keep looking.
        }
last unless $form; # No more data
$mech->select('count', '50');
$mech->click_button(name => 'next');
print STDERR "Got next comment page\n" if $log;
sleep TIME_BETWEEN_PAGELOADS unless $evil;
}
}
# ------------------------------------------------
# Get Comment info from Comment list page
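# Each comment ends up as a hash with these keys:
#   url, title           comment link and title
#   modurl, mods, mod    rating page link, number of ratings, rating
#   answers              number of replies
#   date                 posting date as shown on the page
#   storyurl, storyname  the story the comment belongs to
# ('fname' is added later, when the html file is written.)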
sub parse_cmtlist {
my($data, $results) = @_;
my $parse = HTML::TokeParser->new($data);
# Skip everything before comment list:
my($flag, $tok);
while ( $tok = $parse->get_tag( 'a' ) ) {
my($url) = $tok->[1]->{href};
#print "Prerun $url\n";
if ( $url =~ m'^/comments/[#/0-9]+$' ) { # ' (for emacs)
# Unget doesn't work for get_tag()???
#$parse->unget_token($tok);
$flag = 1;
last;
}
}
print STDERR "No Comments found!\n" unless $flag;
return undef unless $flag;
while ( $flag || ($tok = $parse->get_tag( 'a' ) ) ) {
my($url) = $tok->[1]->{href};
$flag = 0; # 'unget' doesn't work??
next if $url =~ m!^/user/!;
next if $url =~ m!^/story/!;
if ( $url !~ m'^/comments/[#/0-9]+$') { # ' (for emacs)
# This will be after all comments. Just loop
# through them, to be certain.
next;
}
# Data structure for comment:
my %c = (
url => $url,
title => $parse->get_text(),
);
# - - - Get modding info:
$tok = $parse->get_tag( 'a' );
$c{modurl} = $tok->[1]->{href};
my($modinfo)= $parse->get_text();
        if ($modinfo =~ m!([^/]*)\s*/\s*(\d+)!) {
            $c{mods}= 0 + $2;
            $c{mod} = $1 if $c{mods};
            $c{mod} = '' unless $c{mod};
            $c{mod} =~ s/\s+$//; # Rating text may carry trailing space.
        } else {
            warn "Failed to parse mod info '$modinfo'";
        }
# - - - Get Reply info:
$tok = $parse->get_tag( 'b' );
$c{answers} = $parse->get_text() + 0; # No of answers
# - - - Get Posting date:
$tok = $parse->get_tag( '/a' );
my($date) = $parse->get_text();
$date =~ s/^\s*on\s*//i;
$date =~ s/\s*$//;
$c{date} = $date;
# - - - Get Story ref:
$tok = $parse->get_tag( 'a' );
$c{storyurl}= $tok->[1]->{href};
$c{storyname}= $parse->get_text();
# - - -
push @$results, \%c;
}
return $results;
}