#!/opt/perl-5.10.0/bin/perl
use strict;
use warnings;
use feature ':5.10';
use File::Path 'mkpath';
# Register the inline package (declared further down in this file) in %INC
# under the key that matches its name, so that a later
# `require OpenCourseware::Mechanize` does not go looking for a file on disk.
BEGIN { $INC{'OpenCourseware/Mechanize.pm'} = __FILE__ }

# A single browser object is shared by the whole run.
my $mech = OpenCourseware::Mechanize->new;

# Each command-line argument is a regular expression that is matched against
# the course titles; every matching course is mirrored section by section.
for my $pattern ( @ARGV ) {
    for my $url ( $mech->find_all_courses( title_regexp => qr/$pattern/ ) ) {
        $mech->get( $url );

        # Make sure the target directory exists before anything is saved.
        my $path = $mech->course_filepath;
        mkpath( $path );

        $mech->mirror_syllabus;
        $mech->mirror_readings;
        $mech->mirror_assignments;
        $mech->mirror_exams;
        $mech->mirror_videos;
    }
}
package OpenCourseware::Mechanize;
use strict;
use warnings;
use feature ':5.10';
use WWW::Mechanize ();
use HTML::Tidy ();
use XML::LibXML ();
use HTML::TableExtract ();
use File::Slurp ();
use LWP::Simple ();
use constant {
XML_PARSER => XML::LibXML->new,
HTML_TIDY => HTML::Tidy->new
};
# Compile-time setup: inheritance, parser configuration, and generation of
# the mirror_syllabus / mirror_readings / mirror_assignments / mirror_exams
# methods.  The methods are closures installed into the symbol table rather
# than string-eval'ed code, so a mistake in them is a compile error instead of
# a silently missing method.
BEGIN {
    @OpenCourseware::Mechanize::ISA = 'WWW::Mechanize';

    # The shared XML parser must never fetch external resources.
    XML_PARSER->no_network( 1 );

    # mirror_syllabus, mirror_readings:
    #   follow the section link, save the page as plain text to
    #   "<course path>/<file>", then return to the previous page.
    #   Does nothing when the current page has no such link.
    #   Dies (after going back) if saving the page fails.
    for my $spec (
        [ syllabus => qr/^Syllabus\z/, 'syllabus.txt' ],
        [ readings => qr/^Readings\z/, 'readings.txt' ],
      )
    {
        my ( $name, $text_regex, $file ) = @$spec;

        my $method = sub {
            my ( $self ) = @_;
            my $path = $self->course_filepath;

            # Without the link there is nothing to mirror, and calling back()
            # afterwards would leave the course page.
            return unless $self->find_link( text_regex => $text_regex );

            $self->follow_link( text_regex => $text_regex );
            eval {
                File::Slurp::write_file( "$path/$file",
                    $self->content( format => 'text' ) );
            };
            my $e = $@;

            # Always return to the previous page, even after a failure.
            $self->back;
            die $e if $e;
            return;
        };

        no strict 'refs';
        *{ __PACKAGE__ . "::mirror_$name" } = $method;
    }

    # mirror_assignments, mirror_exams:
    #   follow the section link, locate the tables with the given column
    #   headers and mirror every linked file into the course path, then
    #   return to the previous page.
    #   Does nothing when the current page has no such link.
    #   Dies (after going back) if parsing or mirroring fails.
    for my $spec (
        [ assignments => [ 'ASSIGNMENTS', 'SOLUTIONS TO CHALLENGE PROBLEMS' ],
            qr/^Assignments\z/ ],
        [ exams => [ 'EXAMS', 'SOLUTIONS' ], qr/^Exams/ ],
      )
    {
        my ( $name, $headers, $text_regex ) = @$spec;

        my $method = sub {
            my ( $self ) = @_;
            my $path = $self->course_filepath;

            return unless $self->find_link( text_regex => $text_regex );

            my $te = HTML::TableExtract->new( headers => $headers,
                keep_html => 1 );
            $self->follow_link( text_regex => $text_regex );
            eval {
                $te->parse( $self->content );
                for my $table ( $te->tables ) {
                    for my $row ( $table->rows ) {
                        for my $cell ( grep { defined } @$row ) {

                            # The first double-quoted string in the cell's
                            # HTML is taken to be the link target.
                            my ( $url ) = $cell =~ m{"([^"]+)};
                            next unless defined $url;

                            # The local file name is the last path component.
                            my ( $file ) = $url =~ m{/([^/]+)\z};
                            next unless defined $file;

                            LWP::Simple::mirror( "http://ocw.mit.edu$url",
                                "$path/$file" );
                        }
                    }
                }
            };
            my $e = $@;

            # Always return to the previous page, even after a failure.
            $self->back;
            die $e if $e;
            return;
        };

        no strict 'refs';
        *{ __PACKAGE__ . "::mirror_$name" } = $method;
    }
}
# mirror_videos
#   Follows the "Video Lectures" link of the current page and downloads one
#   file per table row that links to a video, naming the files
#   "<course path>/<NN> - <row title>.rm".  Returns to the previous page when
#   done.  Does nothing when the current page has no "Video Lectures" link.
#   Dies (after going back) if parsing or mirroring fails.
sub mirror_videos {
    my ( $self ) = @_;
    my $path = $self->course_filepath;

    # Without the link there is nothing to mirror, and calling back()
    # afterwards would leave the course page.
    my $link_text = qr/^Video Lectures\z/;
    return unless $self->find_link( text_regex => $link_text );

    $self->follow_link( text_regex => $link_text );
    eval {
        # Tidy the HTML first so that the XML parser can build a DOM from it.
        my $doc = XML_PARSER->parse_html_string(
            HTML_TIDY->clean( $self->content ) );

        my $nth = 1;
        for my $tr ( $doc->findnodes( '//tr' ) ) {

            # First non-blank text node of the first cell: the lecture title.
            my ( $name ) =
              grep { /\S/ }
              map  { $_->data }
              $tr->findnodes( 'td[ position() = 1 ]/text()' );

            # Link targets found in the second cell; the last one is used.
            my @videos =
              grep { /\S/ }
              map  { $_->value }
              $tr->findnodes( 'td[ position() = 2 ]/a/attribute::href' );
            next unless @videos && $videos[-1];

            # Number every row that has a video, even if its download is
            # skipped below, so the numbering follows the page.
            my $index = $nth++;
            $name //= '';

            # The linked resource is expected to contain a stream address
            # such as
            # pnm://a1599.v78709.c7870.g.vr.akamaistream.net/ondemand/7/1599/7870/v0001/mitstorage.download.akamai.com/7870/18/18.06/vi
            # The part after the storage host is appended to the download
            # prefix on ocw.mit.edu.
            my $playlist = LWP::Simple::get( $videos[-1] );
            next unless defined $playlist;
            my ( $url ) = $playlist
              =~ m{(?<=mitstorage\.download\.akamai\.com/)(.+)};
            next unless defined $url;
            $url = "http://ocw.mit.edu/ans$url";

            # Pass path and title as arguments, not as part of the format,
            # so a '%' in either cannot corrupt the file name.
            my $file = sprintf '%s/%02d - %s.rm', $path, $index, $name;
            LWP::Simple::mirror( $url, $file );
        }
    };
    my $e = $@;

    # Always return to the previous page, even after a failure.
    $self->back;
    die $e if $e;
    return;
}
# course_filepath
#   Builds a relative directory path from the title of the current page.
#   The title is split into its "|"-separated segments; each segment keeps
#   its inner whitespace, loses leading and trailing punctuation, and is
#   dropped if nothing remains.  The segments are joined with "/".
#   Returns the empty string when the page has no title.
sub course_filepath {
    my ( $self ) = @_;

    my $title = $self->title;
    $title = '' unless defined $title;

    my @segments;

    # One match per segment: a word followed by any number of further
    # whitespace-separated words, none of which contains a "|".
    for my $match ( $title =~ /([^|\s]+(?:\s+[^|\s]+)*)/g ) {
        my $segment = $match;
        $segment =~ s{^[[:punct:]]+}{};
        $segment =~ s{[[:punct:]]+\z}{};
        push @segments, $segment if length $segment;
    }

    return join '/', @segments;
}
# find_all_courses
#   Fetches the OpenCourseWare course index and returns the absolute URLs of
#   the listed courses, without duplicates and in page order.
#
#   Named parameters:
#     title_regexp - optional pattern; only rows of the "Course Title" column
#                    with a cell matching it are returned.  When omitted,
#                    every row that carries a link is returned.
sub find_all_courses {
    my ( $self, %p ) = @_;
    my $title_re = $p{title_regexp};

    my $te = HTML::TableExtract->new(
        headers   => [ 'Course Title' ],
        keep_html => 1,
    );

    # Use the invocant, not a variable from the calling script.
    $self->get( 'http://ocw.mit.edu/OcwWeb/web/courses/courses/index.htm' );
    $te->parse( $self->content );

    my @urls;
    for my $table ( $te->tables ) {
      ROW:
        for my $row ( $table->rows ) {
            my @cells = grep { defined } @$row;

            if ( defined $title_re ) {
                next ROW unless grep { $_ =~ $title_re } @cells;
            }

            # A row contributes at most one URL: the first link found.
            for my $cell ( @cells ) {
                if ( $cell =~ /<a href="([^"]+)/ ) {
                    push @urls, "http://ocw.mit.edu$1";
                    last;
                }
            }
        }
    }

    return unique( @urls );
}
# unique
#   Returns the arguments with repeated values removed.  The first occurrence
#   of each value is kept and the original order is preserved.
sub unique {
    my %already_seen;
    my @distinct;

    for my $item ( @_ ) {
        next if $already_seen{$item}++;
        push @distinct, $item;
    }

    return @distinct;
}
-
Are you posting in the right place? Check out "Where do I post X?" to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in <code> tags, not <pre> tags.
In fact, <pre> tags should generally be avoided. If they must be used,
extreme care should be taken to ensure that their contents do not have
long lines (<70 chars), in order to prevent horizontal scrolling
(and possible janitor intervention).
-
Want more info? "How to link" or "How to display code and escape characters"
are good places to start.