Beefy Boxes and Bandwidth Generously Provided by pair Networks
Problems? Is your data what you think it is?
 
PerlMonks  

Download MIT OpenCourseware

by diotalevi (Canon)
on Nov 12, 2007 at 18:48 UTC ( #650340=sourcecode: print w/replies, xml ) Need Help??
Category:
Author/Contact Info Josh Jore
Description:

This is a convenient way to download some classes from MIT's OpenCourseware website.

#!/opt/perl-5.10.0/bin/perl
use strict;
use warnings;
use feature ':5.10';
use File::Path 'mkpath';
BEGIN { $INC{'OpenCourseware/Mech.pm'} = 1 }

my $mech = OpenCourseware::Mechanize->new;
for my $pattern ( @ARGV ) {
  for my $url ( $mech->find_all_courses( title_regexp => qr/$pattern/ 
+) ) {
    $mech->get( $url );

    my $path = $mech->course_filepath;
    mkpath( $path );
  
    $mech->mirror_syllabus;
    $mech->mirror_readings;
    $mech->mirror_assignments;
    $mech->mirror_exams;
    $mech->mirror_videos;
  }
}

package OpenCourseware::Mechanize;
use strict;
use warnings;
use feature ':5.10';
use WWW::Mechanize     ();
use HTML::Tidy         ();
use XML::LibXML        ();
use HTML::TableExtract ();
use File::Slurp        ();
use LWP::Simple        ();

use constant {
  XML_PARSER => XML::LibXML->new,
  HTML_TIDY  => HTML::Tidy->new
};

BEGIN {
  @OpenCourseware::Mechanize::ISA = 'WWW::Mechanize';

  XML_PARSER->no_network( 1 );

  for ( [syllabus => qr/^Syllabus\z/, 'syllabus.txt' ],
        [readings => qr/^Readings\z/, 'readings.txt' ], ) {
    my ( $name, $text_regex, $file ) = @$_;
    eval <<"AAA";
       sub mirror_$name {
         my ( \$self ) = \@_;
         my \$path = \$self->course_filepath;

         \$self->follow_link( text_regex => \$text_regex );
         eval {
           File::Slurp::write_file( "\$path/\$file", \$self->content( 
+format => 'text' ) );
         };
         my \$e = \$\@;
         \$self->back;
         die \$e if \$e;
       }
AAA
  }

  for ( [assignments => ['ASSIGNMENTS','SOLUTIONS TO CHALLENGE PROBLEM
+S'], qr/^Assignments\z/],
        [exams => ['EXAMS','SOLUTIONS'], qr/^Exams/], ) {
    my ( $name, $headers, $text_regex ) = @$_;

    eval <<"BBB";
      sub mirror_$name {
        my ( \$self ) = \@_;
        my \$path = \$self->course_filepath;
        my \$te = HTML::TableExtract->new( headers => \$headers,
                                           keep_html => 1 );

        \$self->follow_link( text_regex => \$text_regex );
        eval {
          \$te->parse( \$self->content );
          for ( \$te->tables ) {
            for ( \$_->rows ) {
              for ( grep { defined } \@\$_ ) {
                my ( \$url ) = m{"([^"]+)};
                my ( \$file ) = \$url =~ m{/([^/]+)\\z};

                LWP::Simple::mirror( "http://ocw.mit.edu\$url", "\$pat
+h/\$file" );
              }
            }
          }
        };
        my \$e = \$\@;
        \$self->back;
        die \$e if \$e;
      }
BBB
  }
}

sub mirror_videos {
  my ( $self ) = @_;
  my $path = $self->course_filepath;
  $self->follow_link( text_regex => qr/^Video Lectures\z/ );
  eval {
    my $doc = $xml_parser->parse_html_string( $html_tidy->clean( $self
+->content ) );
    my $nth = 1;
    for my $tr ( $doc->findnodes( '//tr' ) ) {
      my ( $name ) =
        grep { /\S/ }
        map { $_->data }
        $tr->findnodes( 'td[ position() = 1 ]/text()' );

      my @videos =
        grep { /\S/ }
        map { $_->value }
        $tr->findnodes( 'td[ position() = 2 ]/a/attribute::href' );

      if ( $videos[-1] ) {
        # pnm://a1599.v78709.c7870.g.vr.akamaistream.net/ondemand/7/15
+99/7870/v0001/mitstorage.download.akamai.com/7870/18/18.06/vi
        my ( $url ) = LWP::Simple::get( $videos[-1] ) =~ m{(?<=mitstor
+age.download.akamai.com/)(.+)};
        $url = "http://ocw.mit.edu/ans$url";
        my $file = sprintf "$path/%02d - $name.rm", $nth++;
        LWP::Simple::mirror( $url, $file );
      }
    }
  };
  my $e = $@;
  $self->back;
  die $e if $e;
}

sub course_filepath {
  my ( $self ) = @_;
  return
    join '/',
    grep { length }
    map {
      s[^[[:punct:]]+][];
      s[[[:punct:]]+\z][];
      $_;
    }
    $self->title =~ /([^|\s]+(?:\s+[^|\s+]+))/g;
}

sub find_all_courses {
  my ( $self, %p ) = @_;

  my $te = HTML::TableExtract->new( headers => [ 'Course Title' ],
                                    keep_html => 1 );

  $mech->get( 'http://ocw.mit.edu/OcwWeb/web/courses/courses/index.htm
+' );
  $te->parse( $mech->content );

  my @urls;
  for ( $te->tables ) {
    for ( $_->rows ) {
      when ($_ ~~ $p{title_regexp}) {
      push @urls, $_ ~~ /<a href="([^"]+)/ ? "http://ocw.mit.edu$1" : 
+();
      }
    }
  }

  return unique( @urls );
}

sub unique {
  my %seen;
  return
    grep { not $seen{$_}++ }
    @_;
}

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://650340]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others about the Monastery: (4)
As of 2020-10-27 06:39 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    My favourite web site is:












    Results (256 votes). Check out past polls.

    Notices?