#!/usr/bin/perl use strict; use warnings; use constant USER => 15; use Getopt::Std; use HTML::TableContentParser; use HTML::TokeParser::Simple; use URI; use WWW::Mechanize; use XML::Simple; my ($monk, $opt, $tut, $listed) = ( {}, {}, [], {} ); Get_Args( $opt ); my $mech = WWW::Mechanize->new( autocheck => 1 ); Get_Tutorials(); Get_Listed(); Print_Report(); sub Get_Args { my $opt = shift; my $Usage = qq{Usage: $0 [options] -h : This help message. -b : Base URL - default: http://www.perlmonks.org/ } . "\n"; getopts( 'hb:' , $opt ) or die $Usage; die $Usage if $opt->{h}; $opt->{b} ||= 'http://www.perlmonks.org/'; } sub Get_Listed { $mech->get( URL( 'Tutorials' ) ); my %link = map { lc $_->url() => undef } $mech->links(); delete $link{'/index.pl?replies=1&node_id=954&displaytype=print'}; for my $url ( keys %link ) { next if exists $monk->{$url} || $url =~ /^#/; if ( $url =~ /^http/ ) { push @{ $listed->{offsite} }, $url; next; } if ( ! Lookup( $url ) ) { push @{ $listed->{non_tutorials} }, $url if ! Monk( $url ); } } } sub Get_Tutorials { $mech->get( URL( 'Super Search' ) ); $mech->field( 'xa', '1' ); # Exclude Authors $mech->field( 'a', 'NodeReaper' ); # Author NodeReaper $mech->tick( 'Tu', '1' ); # Select Tutorials $mech->field( 're', 'N' ); # Exclude Replies $mech->click_button( name => 'go' ); # Submit Form while ( $mech->content !~ /Finished searching database/ ) { Process_Table(); $mech->click_button( name => 'nx' ); } } sub Lookup { my $url = shift; for ( 0 .. $#$tut ) { next if ! defined $tut->[$_]; if ( $url eq $tut->[$_]{id} || $url eq $tut->[$_]{name} ) { delete $tut->[$_]; return 1; } } return 0; } sub Monk { $mech->get( $opt->{b} . $_[0] . '&displaytype=xml' ); my $node = XML::Simple->new()->XMLin( $mech->content() ); return 1 if exists $node->{type}{id} && $node->{type}{id} == USER; return 0; } sub Print_Report { print "\t\t ---- Non-Tutorials linked as Tutorials ----\n"; print "$_\n" for @{ $listed->{non_tutorials} }; print "\t\t --------- Off-Site Tutorials ---------\n"; print "$_\n" for @{ $listed->{offsite} }; print "\t\t ---- Missing Tutorials (not linked) ----\n"; print "$_->{id}\n" for grep defined , @$tut; } sub Process_Link { my ($link, $type) = @_; my $p = HTML::TokeParser::Simple->new( \$link->{data} ); my ($node, $label); while ( my $token = $p->get_token ) { last if $token->is_end_tag; if ( $token->is_start_tag( 'a' ) ) { $node = lc $token->return_attr( 'href' ); next; } $label = lc URI->new( '/index.pl?node=' . $token->as_is )->as_string if $token->is_text; } die "Something went terribly wrong" if ! $node || ! $label; if ( $type eq 'author' ) { $monk->{ $label } = undef; $monk->{ $node } = undef; } else { push @{ $tut } , { id => $node, name => $label }; } } sub Process_Table { my $table = HTML::TableContentParser->new()->parse( $mech->content() ); for my $row ( @{ $table->[0]{rows} } ) { Process_Link( $row->{cells}[1], 'author' ); Process_Link( $row->{cells}[2], 'tutorial' ); } } sub URL { return URI->new( $opt->{b} . 'index.pl?node=' . $_[0] . '&displaytype=print' )->as_string }