#!/opt/perl-5.10.0/bin/perl use strict; use warnings; use feature ':5.10'; use File::Path 'mkpath'; BEGIN { $INC{'OpenCourseware/Mech.pm'} = 1 } my $mech = OpenCourseware::Mechanize->new; for my $pattern ( @ARGV ) { for my $url ( $mech->find_all_courses( title_regexp => qr/$pattern/ ) ) { $mech->get( $url ); my $path = $mech->course_filepath; mkpath( $path ); $mech->mirror_syllabus; $mech->mirror_readings; $mech->mirror_assignments; $mech->mirror_exams; $mech->mirror_videos; } } package OpenCourseware::Mechanize; use strict; use warnings; use feature ':5.10'; use WWW::Mechanize (); use HTML::Tidy (); use XML::LibXML (); use HTML::TableExtract (); use File::Slurp (); use LWP::Simple (); use constant { XML_PARSER => XML::LibXML->new, HTML_TIDY => HTML::Tidy->new }; BEGIN { @OpenCourseware::Mechanize::ISA = 'WWW::Mechanize'; XML_PARSER->no_network( 1 ); for ( [syllabus => qr/^Syllabus\z/, 'syllabus.txt' ], [readings => qr/^Readings\z/, 'readings.txt' ], ) { my ( $name, $text_regex, $file ) = @$_; eval <<"AAA"; sub mirror_$name { my ( \$self ) = \@_; my \$path = \$self->course_filepath; \$self->follow_link( text_regex => \$text_regex ); eval { File::Slurp::write_file( "\$path/\$file", \$self->content( format => 'text' ) ); }; my \$e = \$\@; \$self->back; die \$e if \$e; } AAA } for ( [assignments => ['ASSIGNMENTS','SOLUTIONS TO CHALLENGE PROBLEMS'], qr/^Assignments\z/], [exams => ['EXAMS','SOLUTIONS'], qr/^Exams/], ) { my ( $name, $headers, $text_regex ) = @$_; eval <<"BBB"; sub mirror_$name { my ( \$self ) = \@_; my \$path = \$self->course_filepath; my \$te = HTML::TableExtract->new( headers => \$headers, keep_html => 1 ); \$self->follow_link( text_regex => \$text_regex ); eval { \$te->parse( \$self->content ); for ( \$te->tables ) { for ( \$_->rows ) { for ( grep { defined } \@\$_ ) { my ( \$url ) = m{"([^"]+)}; my ( \$file ) = \$url =~ m{/([^/]+)\\z}; LWP::Simple::mirror( "http://ocw.mit.edu\$url", "\$path/\$file" ); } } } }; my \$e = \$\@; \$self->back; die \$e if \$e; } BBB } } sub mirror_videos { my ( $self ) = @_; my $path = $self->course_filepath; $self->follow_link( text_regex => qr/^Video Lectures\z/ ); eval { my $doc = $xml_parser->parse_html_string( $html_tidy->clean( $self->content ) ); my $nth = 1; for my $tr ( $doc->findnodes( '//tr' ) ) { my ( $name ) = grep { /\S/ } map { $_->data } $tr->findnodes( 'td[ position() = 1 ]/text()' ); my @videos = grep { /\S/ } map { $_->value } $tr->findnodes( 'td[ position() = 2 ]/a/attribute::href' ); if ( $videos[-1] ) { # pnm://a1599.v78709.c7870.g.vr.akamaistream.net/ondemand/7/1599/7870/v0001/mitstorage.download.akamai.com/7870/18/18.06/vi my ( $url ) = LWP::Simple::get( $videos[-1] ) =~ m{(?<=mitstorage.download.akamai.com/)(.+)}; $url = "http://ocw.mit.edu/ans$url"; my $file = sprintf "$path/%02d - $name.rm", $nth++; LWP::Simple::mirror( $url, $file ); } } }; my $e = $@; $self->back; die $e if $e; } sub course_filepath { my ( $self ) = @_; return join '/', grep { length } map { s[^[[:punct:]]+][]; s[[[:punct:]]+\z][]; $_; } $self->title =~ /([^|\s]+(?:\s+[^|\s+]+))/g; } sub find_all_courses { my ( $self, %p ) = @_; my $te = HTML::TableExtract->new( headers => [ 'Course Title' ], keep_html => 1 ); $mech->get( 'http://ocw.mit.edu/OcwWeb/web/courses/courses/index.htm' ); $te->parse( $mech->content ); my @urls; for ( $te->tables ) { for ( $_->rows ) { when ($_ ~~ $p{title_regexp}) { push @urls, $_ ~~ /