comment on

#!/opt/perl-5.10.0/bin/perl
use strict;
use warnings;
use feature ':5.10';
use File::Path 'mkpath';
# Mark the OpenCourseware::Mechanize package (defined further down in this
# same file) as already loaded.  The %INC key has to be the package name
# spelled as a path, i.e. 'OpenCourseware/Mechanize.pm'.
BEGIN { $INC{'OpenCourseware/Mechanize.pm'} = 1 }

# Driver: every command-line argument is treated as a regular expression.
# For each course whose catalogue entry matches, load the course page,
# create the directory returned by course_filepath, and mirror each kind of
# course material into it.
my $mech = OpenCourseware::Mechanize->new;
for my $pattern ( @ARGV ) {
  for my $url ( $mech->find_all_courses( title_regexp => qr/$pattern/ ) ) {
    $mech->get( $url );

    # The directory is derived from the title of the page just loaded.
    my $path = $mech->course_filepath;
    mkpath( $path );

    $mech->mirror_syllabus;
    $mech->mirror_readings;
    $mech->mirror_assignments;
    $mech->mirror_exams;
    $mech->mirror_videos;
  }
}

package OpenCourseware::Mechanize;
use strict;
use warnings;
use feature ':5.10';
use WWW::Mechanize     ();
use HTML::Tidy         ();
use XML::LibXML        ();
use HTML::TableExtract ();
use File::Slurp        ();
use LWP::Simple        ();

# One parser object and one tidy object, each created once at compile time
# and exposed as a package constant.
use constant XML_PARSER => XML::LibXML->new;
use constant HTML_TIDY  => HTML::Tidy->new;

# Compile-time setup for OpenCourseware::Mechanize.
#
# This has to live in a BEGIN block: the driver code at the top of the file
# runs before any ordinary run-time statement in this package would, so the
# inheritance and the generated mirror_* methods must exist by the time
# compilation finishes.
#
# Generated methods (each takes only the invocant and returns nothing):
#   mirror_syllabus, mirror_readings     - save a linked page as plain text
#   mirror_assignments, mirror_exams     - download files linked from tables
#
# Every generated method follows a link from the current page, does its work
# inside an eval, always navigates back, and then re-throws any error so the
# browser history stays balanced even on failure.
BEGIN {
  @OpenCourseware::Mechanize::ISA = 'WWW::Mechanize';

  # Explicit parens make it unambiguous that the constant is being called,
  # not used as a class name.
  XML_PARSER()->no_network( 1 );

  # Pages that are saved as a single text file:
  #   [ method suffix, link text pattern, output file name ]
  for my $spec ( [ syllabus => qr/^Syllabus\z/, 'syllabus.txt' ],
                 [ readings => qr/^Readings\z/, 'readings.txt' ], ) {
    my ( $name, $text_regex, $file ) = @$spec;

    my $method = sub {
      my ( $self ) = @_;
      my $path = $self->course_filepath;

      # follow_link returns undef when no link matches (and autocheck is
      # off); nothing was navigated then, so there is nothing to undo.
      $self->follow_link( text_regex => $text_regex ) or return;
      eval {
        File::Slurp::write_file( "$path/$file",
                                 $self->content( format => 'text' ) );
      };
      my $e = $@;
      $self->back;
      die $e if $e;
      return;
    };

    # Install the closure as a named method; closures replace the former
    # string eval, whose compile errors were silently discarded.
    no strict 'refs';
    *{ __PACKAGE__ . "::mirror_$name" } = $method;
  }

  # Pages whose tables link to downloadable files:
  #   [ method suffix, table column headers to extract, link text pattern ]
  for my $spec ( [ assignments => [ 'ASSIGNMENTS', 'SOLUTIONS TO CHALLENGE PROBLEMS' ],
                   qr/^Assignments\z/ ],
                 [ exams => [ 'EXAMS', 'SOLUTIONS' ], qr/^Exams/ ], ) {
    my ( $name, $headers, $text_regex ) = @$spec;

    my $method = sub {
      my ( $self ) = @_;
      my $path = $self->course_filepath;
      my $te = HTML::TableExtract->new( headers   => $headers,
                                        keep_html => 1 );

      $self->follow_link( text_regex => $text_regex ) or return;
      eval {
        $te->parse( $self->content );
        for my $table ( $te->tables ) {
          for my $row ( $table->rows ) {
            for my $cell ( grep { defined } @$row ) {
              # keep_html leaves the markup in the cell; the first
              # double-quoted string is taken to be the link target.
              my ( $url ) = $cell =~ m{"([^"]+)};
              next unless defined $url;

              # Local file name is the last path component of the URL.
              my ( $file ) = $url =~ m{/([^/]+)\z};
              next unless defined $file;

              LWP::Simple::mirror( "http://ocw.mit.edu$url", "$path/$file" );
            }
          }
        }
      };
      my $e = $@;
      $self->back;
      die $e if $e;
      return;
    };

    no strict 'refs';
    *{ __PACKAGE__ . "::mirror_$name" } = $method;
  }
}

# Mirror the lecture videos linked from the course's "Video Lectures" page.
#
# Follows the "Video Lectures" link from the current page, tidies and parses
# the HTML, and walks every table row.  For a row, the first non-blank text
# node of the first cell is used as the lecture name and the last link in
# the second cell is fetched; the part of the fetched text that follows
# "mitstorage.download.akamai.com/" is appended to "http://ocw.mit.edu/ans"
# to form the download URL.  Files are written to
# "<course_filepath>/NN - <name>.rm".
#
# Always navigates back afterwards and re-throws any error raised while
# processing.  Returns nothing.
sub mirror_videos {
  my ( $self ) = @_;
  my $path = $self->course_filepath;

  # follow_link returns undef when no link matches (and autocheck is off);
  # nothing was navigated then, so there is nothing to undo.
  $self->follow_link( text_regex => qr/^Video Lectures\z/ ) or return;
  eval {
    my $doc = XML_PARSER()->parse_html_string(
      HTML_TIDY()->clean( $self->content ) );
    my $nth = 1;
    for my $tr ( $doc->findnodes( '//tr' ) ) {
      my ( $name ) =
        grep { /\S/ }
        map { $_->data }
        $tr->findnodes( 'td[ position() = 1 ]/text()' );

      my @videos =
        grep { /\S/ }
        map { $_->value }
        $tr->findnodes( 'td[ position() = 2 ]/a/attribute::href' );

      # Rows without a link in the second cell are not lectures.
      my $link = $videos[-1] or next;

      # Number every lecture row, even if its download is skipped below, so
      # file numbering does not shift when one fetch fails.
      my $number = $nth++;

      # The link points at a small descriptor such as
      # pnm://a1599.v78709.c7870.g.vr.akamaistream.net/ondemand/7/1599/7870/v0001/mitstorage.download.akamai.com/7870/18/18.06/vi...
      my $descriptor = LWP::Simple::get( $link );
      next unless defined $descriptor;

      my ( $tail ) = $descriptor =~ m{mitstorage\.download\.akamai\.com/(.+)};
      next unless defined $tail;

      $name //= 'untitled';

      # Pass path and name as sprintf arguments, not as part of the format,
      # so a literal '%' in either cannot be read as a format directive.
      my $file = sprintf '%s/%02d - %s.rm', $path, $number, $name;
      LWP::Simple::mirror( "http://ocw.mit.edu/ans$tail", $file );
    }
  };
  my $e = $@;
  $self->back;
  die $e if $e;
  return;
}

# Build a relative directory path from the current page title.
#
# The title is split on '|'; each segment has surrounding whitespace and
# leading/trailing punctuation removed, empty segments are dropped, and the
# remaining segments are joined with '/'.  For example the title
# "A Site | Some Dept | 1.23 Course Name, Term | Home" becomes
# "A Site/Some Dept/1.23 Course Name, Term/Home".
#
# Returns the path as a string ('' when the page has no usable title).
sub course_filepath {
  my ( $self ) = @_;

  my $title = $self->title // '';

  my @segments;
  for my $segment ( split /\|/, $title ) {
    $segment =~ s/\A\s+//;
    $segment =~ s/\s+\z//;
    $segment =~ s/\A[[:punct:]]+//;
    $segment =~ s/[[:punct:]]+\z//;
    push @segments, $segment if length $segment;
  }

  return join '/', @segments;
}

# Return the URLs of all courses on the OCW course index whose entry matches
# a pattern.
#
# Named parameters:
#   title_regexp - pattern matched against each cell of the 'Course Title'
#                  column (cells still contain their HTML because of
#                  keep_html).  When omitted, every course is returned.
#
# Loads the index page through the invocant, so the object is left on that
# page afterwards.  Returns the list of absolute course URLs, de-duplicated
# in first-seen order.
sub find_all_courses {
  my ( $self, %p ) = @_;

  my $title_regexp = $p{title_regexp} // qr//;

  my $te = HTML::TableExtract->new( headers   => [ 'Course Title' ],
                                    keep_html => 1 );

  # Use the invocant, not the script's file-scoped $mech, so the method
  # works on any instance.
  $self->get( 'http://ocw.mit.edu/OcwWeb/web/courses/courses/index.htm' );
  $te->parse( $self->content );

  my @urls;
  for my $table ( $te->tables ) {
    for my $row ( $table->rows ) {
      my @cells = grep { defined } @$row;

      # A row qualifies when any of its cells matches the title pattern.
      next unless grep { $_ =~ $title_regexp } @cells;

      # Take the first link found in the row.
      for my $cell ( @cells ) {
        if ( $cell =~ /<a href="([^"]+)/ ) {
          push @urls, "http://ocw.mit.edu$1";
          last;
        }
      }
    }
  }

  return unique( @urls );
}

# Return the arguments with duplicates removed, keeping the first occurrence
# of each value and preserving the original order.  In scalar context the
# result is the number of distinct values.
sub unique {
  my ( %seen, @distinct );
  for my $item ( @_ ) {
    next if $seen{$item}++;
    push @distinct, $item;
  }
  return @distinct;
}
[download]

In reply to Download MIT OpenCourseware by diotalevi

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


Don't ask to ask, just ask
	PerlMonks