Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things


by BerntB (Deacon)
on Nov 13, 2006 at 15:19 UTC ( #583725=sourcecode: print w/replies, xml ) Need Help??
Category: Web Stuff
Author/Contact Info Bernt 'Bug' Budde
Description: Screen scrape

Downloads all your posted comments to a directory. Generates a yaml file with definitions of all the comments, suitable for other scripts to use for parsing the data.

Can be the first of a number of scripts to e.g. find mod-bombers, etc.

Tab size 4.



# This has the same license as Perl.

use strict;
use warnings;

# Probably needs to install these modules.

# See the cpan command or your local variant like apt-get
# for Debian based systems and PPM for Win. All these should
# be in Debian Testing/Ubuntu.

use HTML::TokeParser;            # In HTML::Parser.
use WWW::Mechanize;
use Getopt::Long;
use Date::Manip;
use YAML;

# ------------------------------------------------
our $VERSION = '0.01';

# Delay between page fetches, to be polite to the server (seconds).
use constant TIME_BETWEEN_PAGELOADS    => 4;
# Base URL of the Scoop site to scrape. The original value was stripped
# by the HTML scrape this file came from; restored from context (the
# usage text refers to K5 / kuro5hin throughout).
use constant SCOOP_WEB_SITE            => "http://www.kuro5hin.org";

# ------------------------------------------------
# Help text:

# Help/usage text. NOTE(review): the closing '";' and several line ends
# were lost when this code was scraped from HTML (anything in <angle
# brackets> was eaten, long lines truncated); reconstructed below.
my $usage = "
This program is used to screen scrape kuro5hin.org for your comments.

You own your comments on K5 but not other people's comments, so this
just supports getting what you have written yourself. It d/l the full
html text, so do NOT use 'nested' for 'Comment display mode'! It hurts
usability for grep-ping (searching) your saved comments, anyway.

    Use: perl <this script> OPTIONS ...

    Options are:

  -u username   Specify the username at kuro5hin.
  -p password   Password so the program can log in.
  --dump dir    Save all your comments in dir.

  -n   no       Just get the 'no' last comments.
  -log          Log what's done on STD ERR. (Simple debug.)
 --evil         Usually, the program waits a bit between every read
                from K5. Not with this flag. (-: If you use it, I'll
                laugh if rusty cancels your account. :-)
 --help         Shows this text.

'-p' and '--dump' are mandatory.

If no user name is specified, the login name will be used (-: that
should work for rusty, at least :-).

    Examples of use:

Dumps html of ALL posts to K5 in dir 'foo'.
  perl <this script> -u bar -p XX --dump 'foo' --log
Gets html of the last 200 posts to K5 in a dir.
  perl <this script> -u bar -p XX --dump foo --log -n 200

    To Do:

- Utility to parse out the html to textfiles, so they are easier to
  grep (search, for win people). 3vial.
- Utility that d/l new things you've written (and when you get comment
  and mods on old stuff) and updates without d/l everything!
- Utility that d/l modding and finds who modbombs you (Hello 'V'!)
- Utility that parse data to find sniper comments added much later, to
  get the last word (Hello, 'A Bore'!)
- Remove warning when d/l comment list. :-)


- The generated yaml file contains all the info an updater program would
  need -- and it is both human readable and not Perl specific. Have fun!
- Not tested on Win; sorry, I can only run Unix stuff at home.
- Don't use the same dir for multiple downloads. An information file
  will be overwritten.
- This will certainly needs modification to work with other Scoop sites.
- I should have written an extension to Scoop and published instead,
  which rusty could have installed. More efficient.
";

# ------------------------------------------------
# Parse parameters:

my($username, $password, $dir,
   $max_comments, $log, $evil, $help);

# Parse the command line. (The closing ');' of this call and the '}' of
# the help block below were lost in the HTML scrape; restored.)
my($rslt)    = GetOptions('u|user:s'    => \$username,
                          'p|pwd=s'     => \$password,
                          'h|help'      => \$help,
                          'evil'        => \$evil,
                          'n|maxno:i'   => \$max_comments,
                          'dump:s'      => \$dir,
                          'log'         => \$log,
                         );

die "Bad parameters?!$usage"                unless $rslt;

if ($help) {
    print $usage;
    exit 0;
}

die "Password is mandatory. Try '--help'"    unless $password;
$username    = `whoami`                        unless $username;
chomp $username;    # Bug fix: backticks keep the trailing newline, which
                    # would otherwise corrupt the login URL and file names.
die "Mandatory with '--dump'"                unless $dir;

# Try to create dir before slow && painful d/l:
chop $dir                                    if $dir =~ m!/$!;
mkdir $dir                                    unless -d $dir;
die "'--dump' can't make dir '$dir'"        unless -d $dir;

# ------------------------------------------------
# Init and try to login at the site:

# Create the browser object and log in to the Scoop site.
my $mech        = WWW::Mechanize->new();
$mech->stack_depth(2);                # Don't use so much RAM

# Scoop accepts credentials as plain query parameters.
# NOTE(review): $username/$password are interpolated unescaped; values
# containing URI-special characters (&, =, %, space) would break the
# request. The fetch result is also not checked -- confirm acceptable.
$mech->get(SCOOP_WEB_SITE . "?uname=$username&pass=$password");

# A logged-in Scoop page carries a logout link; if neither variant of
# it is present, assume the login was rejected.
die "Failed to login with user $username"
            if    $mech->content() !~ /Logout from all locations/ &&
                $mech->content() !~ /Logout $username/;

print STDERR "Logged in\n"                    if $log;
sleep TIME_BETWEEN_PAGELOADS                unless $evil;

# ------------------------------------------------
# Read down Comment list:

# Accumulates one hashref of metadata per comment, filled in by
# load_from_comment_list() below and consumed by the download loop.
my $cmts = [];

load_from_comment_list($mech, $username, $cmts,
                       $max_comments, $log, $evil);

# Readers with multiple accounts could simply add further calls here,
# one per account.

# ------------------------------------------------
# Get all comments.

# (Already checked so there is a dir $dir.)

# (Start with earliest days, so numbering of comments a given day
# is always the same even if adds comments the same day.)
foreach my $c (reverse @$cmts) {
    $mech->get(SCOOP_WEB_SITE . $c->{url});

    # - - - Create file name: YY-MM-DD-N.html, N counting up per day.
    my($date)    = ParseDate($c->{date});
    my($fname)    = UnixDate($date, "%y-%m-%d");
    my($no)        = 1;
    # (Reconstructed: the loop body was lost in the scrape; it can only
    # have advanced the per-day counter until an unused name is found.)
    while (-f "$dir/$fname-$no.html") {
        $no++;
    }
    $c->{fname}    = "$fname-$no.html";
    $fname        = "$dir/$fname-$no.html";

    # - - - Log, etc:
    print STDERR "Got $fname " . $c->{url} ."\n"    if $log;
    sleep TIME_BETWEEN_PAGELOADS            unless $evil;

    # - - - Prepare data for storing:
    my($html)    = $mech->content();

    # Add html comment with easily parsed known info about comment:
    my($text) = sprintf    "mod %s,%s. repl %s date %s, url '%s'"
                    .    " story %s title '%s'.",
                        $c->{mods}, $c->{mod}, $c->{answers},
                        $c->{date}, $c->{url},
                        $c->{storyurl}, $c->{title};
    # Escape any '-->' in the data so it cannot terminate the HTML
    # comment early. (The scraped original read s/-->/-->/g -- a no-op
    # left by HTML-entity decoding; '--&gt;' restores the intent.)
    $text =~ s/-->/--&gt;/g;
    die "Couldn't find '</head>' in " . $c->{url}
                    unless $html =~ s@</HEAD@<!-- $text --> </HEAD@i;

    # - - - Store:
    if (open(my $fh, '>', $fname)) {
        print $fh $html;
        close $fh;
    } else {
        warn "Couldn't save to file '$fname'";
    }
}

# ------------------------------------------------
# Dump YAML Data description (important)

# This yaml file makes an update script trivial to write.

# Persist the collected metadata (url, title, date, mods, replies,
# story refs, local file name) for every comment -- machine-readable
# and human-readable, not Perl specific.
YAML::DumpFile("$dir/comments.yaml", $cmts);

# ------------------------------------------------
# Dump comment list as simple html.

# Make utility html list with references to all comments.

# Write a simple HTML index linking every downloaded comment, with its
# rating, reply count and story. (The closing quote of the header print
# and the block closers were lost in the HTML scrape; restored.)
print STDERR "Save list to $dir/comment-list.html\n"    if $log;
if (open(my $fh, '>', "$dir/comment-list.html")) {
    print $fh "\n<html><head><TITLE>Comments By $username</title><body>\n";
    foreach my $c (@$cmts) {
        print $fh '<p>';
        print $fh '<a href="', SCOOP_WEB_SITE, $c->{url}, '">',
                  $c->{title}, '</a> on ', $c->{date};
        print $fh "<br />\n";
        my($mod) = $c->{mod} ? $c->{mod} : '';
        printf $fh 'Rated by <a href="%s">%s</a> for %s, %s replies.',
                    SCOOP_WEB_SITE . $c->{modurl}, $c->{mods},
                    $mod, $c->{answers};
        printf $fh ' Story <a href="%s">%s</a>',
                    SCOOP_WEB_SITE . $c->{storyurl}, $c->{storyname};
        print $fh "</p>\n";
    }
    print $fh "\n</body></html>\n";
    close $fh;
} else {
    warn "ERR! Couldn't save to '$dir/comment-list.html'!?\n";
}

# ------------------------------------------------
# Get Comment list from Scoop site:

# Fetch the user's comment-list pages from the Scoop site and append
# one metadata hashref per comment to @$cmts (via parse_cmtlist).
# Stops after $max_comments comments if that is set.
sub load_from_comment_list {
    my($mech, $username, $cmts, $max_comments, $log, $evil) = @_;

    my($cmts_url)    = SCOOP_WEB_SITE . "/user/$username/comments";

    # Suitable test; a user with quite few posts (URL truncated in the
    # scrape this file came from):
    #$cmts_url        = SCOOP_WEB_SITE . "/user/Zombie%20Abu%20Musab/comments";

    # - - - Read first page of comment list:
    # (Reconstructed: the fetch itself was lost in the scrape, though
    # the surrounding comments survived.)
    # XXX Err test for success here!
    $mech->get($cmts_url);
    print STDERR "Got 1st comment list page\n"    if $log;
    sleep TIME_BETWEEN_PAGELOADS                unless $evil;

    # - - - Loop over comment list pages and get comment refs:

    while (1) {
        my($no_cmts)= scalar(@$cmts);
        last        if $max_comments && $max_comments < $no_cmts;

        my($content)= $mech->content();
        my($cmtinfo)= parse_cmtlist(\$content, $cmts);
        my($c_now)    = scalar(@$cmts);
        print STDERR "Has read $c_now cmts\n"    if $log;

        # - - - Go to next page, if any, of the comment list:

        # (form_with_fields() is quite new in the WWW::Mechanize API,
        # so don't use it; probe the first few forms for a 'next'
        # button instead. form_number() is 1-based -- the original
        # 0-based loop caused the warning the To-Do list mentions.)
        my $form;
        for (my $i = 1; $i <= 4; $i++) {
            $form    = $mech->form_number($i);
            last                    unless $form;
            last                    if $form->find_input('next');
        }
        # No more data unless some form actually has a 'next' button:
        last        unless $form && $form->find_input('next');

        $mech->select('count', '50');
        $mech->click_button(name => 'next');
        print STDERR "Got next comment page\n"    if $log;
        sleep TIME_BETWEEN_PAGELOADS        unless $evil;
    }

    return;
}

# ------------------------------------------------
# Get Comment info from Comment list page

# Parse one comment-list page (HTML in $$data) and push a hashref per
# comment onto @$results with keys: url, title, modurl, mods, mod,
# answers, date, storyurl, storyname. Returns $results, or undef when
# the page contains no comment links. (Several closers and short
# statements were lost in the HTML scrape; restored below.)
sub parse_cmtlist {
    my($data, $results) = @_;

    my $parse = HTML::TokeParser->new($data);

    # Skip everything before comment list:
    my($flag, $tok);
    while ( $tok    = $parse->get_tag( 'a' ) ) {
        my($url)    = $tok->[1]->{href};
        #print "Prerun $url\n";
        if ( $url =~ m'^/comments/[#/0-9]+$' ) { # ' (for emacs)
            # Unget doesn't work for get_tag()??? So remember via $flag
            # that $tok already holds the first comment link.
            $flag    = 1;
            last;
        }
    }

    print STDERR "No Comments found!\n"    unless $flag;
    return undef                        unless $flag;

    while ( $flag || ($tok = $parse->get_tag( 'a' ) ) ) {

        my($url)    = $tok->[1]->{href};
        $flag        = 0;        # 'unget' doesn't work??

        next                            if $url =~ m!^/user/!;
        next                            if $url =~ m!^/story/!;
        if ( $url !~ m'^/comments/[#/0-9]+$') { # ' (for emacs)
            # This will be after all comments. Just loop
            # through them, to be certain.
            next;
        }

        # Data structure for comment:
        my %c = (
            url        => $url,
            title    => $parse->get_text(),
        );

        # - - - Get modding info (e.g. "3.00 / 5" -> rating, count):
        $tok = $parse->get_tag( 'a' );
        $c{modurl}    = $tok->[1]->{href};
        my($modinfo)= $parse->get_text();
        if ($modinfo =~ m!([^/]*)\s*/\s*(\d+)!) {
            $c{mods}= 0 + $2;
            $c{mod}    = $1                if $c{mods};
            $c{mod}    = ''                unless $c{mod};
        } else {
            # (Fixed unbalanced quote in the original warn message.)
            warn "Failed to parse mod info '$modinfo'";
        }

        # - - - Get Reply info:
        $tok = $parse->get_tag( 'b' );
        $c{answers}    = $parse->get_text() + 0; # No of answers

        # - - - Get Posting date ("on <date>" -> "<date>"):
        $tok = $parse->get_tag( '/a' );
        my($date)    = $parse->get_text();
        $date =~ s/^\s*on\s*//i;
        $date =~ s/\s*$//;
        $c{date}    = $date;

        # - - - Get Story ref:
        $tok = $parse->get_tag( 'a' );
        $c{storyurl}= $tok->[1]->{href};
        $c{storyname}= $parse->get_text();

        # - - -
        push @$results, \%c;
    }

    return $results;
}

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://583725]
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others examining the Monastery: (3)
As of 2020-09-27 09:28 GMT
Find Nodes?
    Voting Booth?
    If at first I don't succeed, I …

    Results (142 votes). Check out past polls.