Category: | Web Stuff |
Author/Contact Info | /msg LTjake |
Description: | This script grabs the contents of CAKE's news page, parses the entries, and outputs them as either an RSS feed or an Atom feed. |
package WWW::CAKEmusic::News;

# Scrapes CAKE's news page and exposes the entries as an RSS or Atom feed.
#
# Data model built by parse() and stored in $self->{ items }:
#   [ { title => 'Newsflash for ...', date => 'YYYY-MM-DD' | undef,
#       descriptions => [ 'text of first <li>', ... ] }, ... ]

use strict;
use warnings;

# Debug
# use diagnostics;
# use LWP::Debug qw( + );
# use Data::Dumper;

use Carp;
use WWW::Mechanize;
use HTML::TokeParser::Simple;

use constant URL        => 'http://www.cakemusic.com/news.html';
use constant FEED_TITLE => 'CAKE News';
use constant FEED_DESC  => 'Recent news from the band, CAKE.';

# Could've used a Date module...
my %months = (
    jan => '01', feb => '02', mar => '03', apr => '04',
    may => '05', jun => '06', jul => '07', aug => '08',
    sep => '09', oct => '10', nov => '11', dec => '12',
);

# Constructor. Takes no arguments; fetches and parses the news page
# immediately, so it croaks if the page cannot be retrieved.
sub new {
    my $class = shift;
    my $self  = bless {}, $class;

    $self->fetch;

    return $self;
}

# Fetch the news page into $self->{ _raw } and parse it.
# Croaks with the HTTP status line when the request fails.
# Returns whatever parse() returns.
sub fetch {
    my $self = shift;

    # autocheck is disabled so that a failed request reaches the croak
    # below instead of dying inside get() with Mechanize's own message.
    my $agent = WWW::Mechanize->new( autocheck => 0 );
    $agent->get( URL );

    croak 'Error fetching ' . URL . ': ' . $agent->response->status_line
        unless $agent->success;

    $self->{ _raw } = $agent->content;

    return $self->parse;
}

# Parse $self->{ _raw } into the list of news items (see the data model
# at the top of the package). Stores the result in $self->{ items } and
# returns that array reference.
sub parse {
    my $self = shift;

    my $raw       = defined $self->{ _raw } ? $self->{ _raw } : '';
    my $capturing = 0;
    my $content   = '';

    # Remove all irrelevant parts of the content: keep everything from the
    # first "Newsflash for" line up to (not including) the first <DL> line.
    for my $line ( split /\n/, $raw ) {
        $capturing = 1 if !$capturing and $line =~ /Newsflash for/;
        last if $line =~ /<DL>/;
        next if !$capturing or $line =~ /^$/;

        $line =~ s/\s+/ /g;
        $content .= $line;
    }

    my $parser = HTML::TokeParser::Simple->new( \$content );

    # this will make sure text tokens won't be split
    $parser->unbroken_text( 1 );

    my @items;

    while ( my $token = $parser->get_token ) {
        my $text = $token->as_is;
        next unless $text =~ /\S/;

        if ( $token->is_text ) {

            # new day of news
            if ( $text =~ /Newsflash for/ ) {
                push @items, {
                    title        => $text,
                    date         => _parse_date( $text ),
                    descriptions => [],
                };
            }

            # just plain text: append to the list item currently open.
            # Text seen before any header or any <li> has nowhere to go
            # and is ignored rather than indexing an empty array.
            elsif ( @items and @{ $items[ -1 ]->{ descriptions } } ) {
                $items[ -1 ]->{ descriptions }->[ -1 ] .= $text;
            }
        }

        # each news item is a list-item
        elsif ( $token->is_start_tag( 'li' ) ) {
            push @{ $items[ -1 ]->{ descriptions } }, '' if @items;
        }
    }

    $self->{ items } = \@items;

    return $self->{ items };
}

# Private helper: turn a header such as
# "Newsflash for Week of January 12, 2004" into "2004-01-12".
# Returns undef when the header does not contain a recognisable date.
sub _parse_date {
    my $title = shift;

    return unless defined $title;

    return unless $title =~ m{
        Newsflash \s+ for \s+ (?: Week \s+ of \s+ )?
        ( [A-Za-z]{3} ) \S* \s+    # month name; only first 3 letters matter
        ( \d{1,2} ) \D             # day of month, then punctuation/suffix
        .*?
        ( \d{4} )                  # four-digit year
    }xis;

    my ( $month, $day, $year ) = ( lc $1, $2, $3 );

    return unless exists $months{ $month };

    return join '-', $year, $months{ $month }, sprintf( '%02d', $day );
}

# use XML::RSS to make an RSS feed
# Returns the feed as a string; one RSS item per news list entry.
sub as_rss {
    my $self = shift;

    require XML::RSS;

    my $feed = XML::RSS->new;

    $feed->channel(
        title       => FEED_TITLE,
        link        => URL,
        description => FEED_DESC,
    );

    for my $item ( @{ $self->{ items } || [] } ) {
        my $date         = $item->{ date };
        my @descriptions = @{ $item->{ descriptions } };

        for my $index ( 1 .. scalar @descriptions ) {

            # The fragment only serves to give every entry a unique link.
            my $anchor = defined $date ? "$date-$index" : $index;

            $feed->add_item(
                title       => $item->{ title },
                link        => URL . '#' . $anchor,
                description => $descriptions[ $index - 1 ],
                ( defined $date ? ( dc => { date => $date } ) : () ),
            );
        }
    }

    return $feed->as_string;
}

# use XML::Atom to make an Atom feed
# Returns the feed as an XML string; one Atom entry per news list entry.
sub as_atom {
    my $self = shift;

    require XML::Atom::Feed;
    require XML::Atom::Entry;

    my $feed = XML::Atom::Feed->new;
    $feed->title( FEED_TITLE );

    for my $item ( @{ $self->{ items } || [] } ) {
        for my $desc ( @{ $item->{ descriptions } } ) {
            my $entry = XML::Atom::Entry->new;

            $entry->title( $item->{ title } );
            $entry->content( $desc );

            $feed->add_entry( $entry );
        }
    }

    return $feed->as_xml;
}

package main;

# Validate the requested format BEFORE touching the network, and only ever
# dispatch to a known method name (the argument comes from the command line).
my %method_for = ( rss => 'as_rss', atom => 'as_atom' );

my $format = defined $ARGV[ 0 ] ? $ARGV[ 0 ] : '';

die "Usage: $0 rss|atom\n" unless exists $method_for{ $format };

my $news   = WWW::CAKEmusic::News->new;
my $output = $method_for{ $format };

print $news->$output;

__END__

=head1 NAME

cakenews - grab the latest CAKE news in Atom or RSS format

=head1 SYNOPSIS

    cakenews.pl rss > cake.rss
    cakenews.pl atom > cake.atom

=head1 DESCRIPTION

This script grabs the contents of CAKE's news page, parses the entries,
and outputs them as either an RSS feed or an Atom feed.

=head1 NOTICE

Please do not abuse CAKE's server with this script. Consider using
WWW::Mechanize::Cached if you want to use this on a regular basis.
Back to
Code Catacombs