#!/usr/bin/env perl # sf-digest.pl by Tim Rayner, 2002 # (timrayner@btinternet.com) # This code may be modified and distributed on # the same terms as the Perl source code. # # Script to periodically query a web page, extract # a table and identify new rows in that table. # New results are cached in $savefile; if a second # file ($archivefile) has not been modified # within the time $cacheperiod, the new table rows # (i.e. those in $savefile but not in # $archivefile) are mailed to the $mailto email address. # The results are then written out to # $archivefile to be omitted from future emails. # # The reason for this convoluted approach is that we # aim to capture table rows which may not # be present on the web page for very long (in some # cases, only a matter of hours). However, # we want to avoid sending emails more than once a # day (change $cacheperiod to alter this # behaviour). We also want to maintain a persistant # cache of results to overcome difficulties # connecting to the web page (originally # sourceforge.net, hardly a paragon of reliability). # # Use the command 'sf-digest.pl now' (as opposed to # simply 'sf-digest.pl') to override # the result cache mechanism and mail all the # current results now. # use strict; use warnings; use LWP::UserAgent; use HTTP::Request; use HTML::TableExtract; use MIME::Lite; ############################################# ############ User Config Section ############ ############################################# # # Address to send mail to. You will want to change this. # my $mailto='user@host'; # # Address to send mail from. Ensure that your SMTP # server or local MTA will accept this value. # my $mailfrom='user@host'; # # Subject line of the sent email: # my $mailsubject='SourceForge Update'; # # Table columns to extract (in order that they will # appear in the email). # N.B. Keep 'Request ID' as column 1; changing this # will break the script. # my $tableheaders=['Request ID','Date','Summary']; # # Lines at start and end of the email body text: # my $mailbodyhead="Here are today's new SourceForge requests:

". "Follow the link to see the full description or to add comments to an item.
". "You can use the monitor option to receive any comments added to an item.
"; my $mailbodytail="
You can also go to the SourceForge GO Curator Requests Tracker
". " to see the complete list of submissions:
". qq!http://sourceforge.net/tracker/?at id=440764&group_id=36855&func=browse

!. "Signed,

the sf-digest daemon."; # # Save files used to store the last set of data downloaded. # It's likely that you will want to change these to a # set location (e.g. /home/user/.sf-digest-latest.txt). # my $savefile='/home/user/.sf-digest-latest.txt'; my $archivefile='/home/user/.sf-digest-archive.txt'; # # SMTP server via which to send mail. If undefined, # use local sendmail command. # my $smtp_server=''; # # Period for which results are cached prior to emailing # them (in seconds). Initially set to 1 day minus 15 # minutes (85500 seconds) # my $cacheperiod=(1*24*60*60)-(15*60); ##################################################### ### URL vars - You shouldn't need to touch these, ### ### unless SourceForge does. ### ##################################################### # # Web page to check (we will concatenate an offset # ($delta) later) # my $url = 'http://sourceforge.net/tracker/index.php?func=browse&group_id=36855&atid=440764&offset='; # # Offset between pages # my $delta = 50; # # Total limit on number of request IDs to download # from the web page. This is a safety feature, and # as such should not need changing. Change this if # the project ever balloons out of control :-) # my $limit=1000; # # HTML tags embedded in the email: # $idurl=$idurl_start.''.$idurl_end..""; # (see below). # # - part one of requestID URL: my $idurl_start=''; ############################################# ########## End User Config Section ########## ############################################# sub gettable{ # download the table data, return a hashref # with column 1 as key and the other columns # as values, joined in a tab-delimited string my $url=shift(); my $delta=shift(); my $limit=shift(); my $tableheaders=shift(); my %results; # Here we $limit results to prevent infinite loop OFFSET: for (my $offset=0;$offset<$limit;$offset=$offset+$delta){ my $pageurl=$url.$offset; my $ua = LWP::UserAgent->new(timeout => 10); my $request = HTTP::Request->new('GET',$pageurl); my $response = $ua->request($request); if ($response->is_success){ my $te = new HTML::TableExtract( headers => $tableheaders ); $te->parse($response->content); last OFFSET unless $te->table_states; # No more table to parse foreach my $ts ($te->table_states) { foreach my $rowref ($ts->rows) { my @row=@{$rowref}; # Strip out useless rows # (this is SourceForge-specific) next if (($row[0]=~ /^\S$/) || ($row[0]=~ /\<-- Previous 50/)); # Format data and push into %results my $idurl=$idurl_start.$row[0].$idurl_end.$row[0].""; $results{$row[0]}= join("\t", $idurl, @row[1..$#row]); } } } else { print "Error: ".$response->status_line."\n"; last OFFSET; } } return \%results; } sub readfile{ # Read in the old results file, # return old results hashref my $file=shift; my %oldresults; open (SAVEFILE,"<$file") or do { warn ("No save file; creating one named \'$file\'.\n"); return undef; }; while (my $line=){ chomp $line; $line=~/(\w*)\t(.*)/; $oldresults{$1}=$2; } return \%oldresults; } sub writefile{ # write new results to save file my $file=shift(); my %results=%{shift()}; open (SAVEFILE,">$file") or die ("Could not open save file for writing: $!\n"); foreach my $key (sort keys %results){ print SAVEFILE ("$key\t$results{$key}\n"); } } ############ ### Main ### ############ # Set the cache period to zero if we're called with # the 'now' directive (i.e. 'sf-digest now') if ($ARGV[0] && ($ARGV[0] eq 'now')){$cacheperiod = 0;} # Get old and new table data; overwrite old save # file with new data my $resref=&readfile($savefile); my %allresults=%{$resref} if $resref; my %newresults=%{&gettable($url,$delta,$limit,$tableheaders)}; # Merge the hashes to prevent false positive # upon SourceForge timeouts, # write everything out to the save file @allresults{keys %newresults} = values %newresults; &writefile($savefile,\%allresults); # We can either quit now or send the new message. # If the archive file is older than 1 day minus 5 minutes, # or if the archive file does not exist (i.e. first run), # we send the message. if ((! -f $archivefile) || (((stat($archivefile))[9]) <= (time-$cacheperiod))){ # Read in the archive file my $archiveref=&readfile($archivefile); my %archiveresults=%{$archiveref} if $archiveref; # Construct main mail body text; #omit entries found in the archived table data my @mailbody; foreach my $id (sort keys %allresults){ # Strip out non-ascii characters #(certain mail reader programs prefer this) $allresults{$id}=~ s/[^[:ascii:]]//g; push (@mailbody, "$allresults{$id}\n") unless $archiveresults{$id}; } # Construct the rest of the mail and send it if (@mailbody){ # Don't send if there are no changes # Finish off the mail unshift (@mailbody, $mailbodyhead); push (@mailbody, $mailbodytail); my $body= join("
", @mailbody); my $mail=MIME::Lite->new( From => $mailfrom, To => $mailto, Subject => $mailsubject, Type => 'text/html', Encoding => 'quoted-printable', Data => $body, ); if ($smtp_server){ # Finally, send the mail $mail->send('smtp',$smtp_server); }else{ $mail->send(); } } # Merge all data and spew it into the archive @archiveresults{keys %allresults} = values %allresults; &writefile($archivefile,\%archiveresults); }