Beefy Boxes and Bandwidth Generously Provided by pair Networks
Think about Loose Coupling
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/bin/env perl # sf-digest.pl by Tim Rayner, 2002 # (timrayner@btinternet.com) # This code may be modified and distributed on # the same terms as the Perl source code. # # Script to periodically query a web page, extract # a table and identify new rows in that table. # New results are cached in $savefile; if a second # file ($archivefile) has not been modified # within the time $cacheperiod, the new table rows # (i.e. those in $savefile but not in # $archivefile) are mailed to the $mailto email address. # The results are then written out to # $archivefile to be omitted from future emails. # # The reason for this convoluted approach is that we # aim to capture table rows which may not # be present on the web page for very long (in some # cases, only a matter of hours). However, # we want to avoid sending emails more than once a # day (change $cacheperiod to alter this # behaviour). We also want to maintain a persistant # cache of results to overcome difficulties # connecting to the web page (originally # sourceforge.net, hardly a paragon of reliability). # # Use the command 'sf-digest.pl now' (as opposed to # simply 'sf-digest.pl') to override # the result cache mechanism and mail all the # current results now. # use strict; use warnings; use LWP::UserAgent; use HTTP::Request; use HTML::TableExtract; use MIME::Lite; ############################################# ############ User Config Section ############ ############################################# # # Address to send mail to. You will want to change this. # my $mailto='user@host'; # # Address to send mail from. Ensure that your SMTP # server or local MTA will accept this value. # my $mailfrom='user@host'; # # Subject line of the sent email: # my $mailsubject='SourceForge Update'; # # Table columns to extract (in order that they will # appear in the email). # N.B. Keep 'Request ID' as column 1; changing this # will break the script. # my $tableheaders=['Request ID','Date','Summary']; # # Lines at start and end of the email body text: # my $mailbodyhead="<html>Here are today's new SourceForge requests:<br> +<br>". "Follow the link to see the full description or to add comments to + an item. <br>". "You can use the monitor option to receive any comments added +to an item.<br>"; my $mailbodytail="<br>You can also go to the SourceForge GO Curator Re +quests Tracker<br>". " to see the complete list of submissions:<br>". qq!<a href="http://sourceforge.net/tracker/?atid=440764&group_id=3 +6855&func=browse">http://sourceforge.net/tracker/?at id=440764&group_id=36855&func=browse</a><br><br>!. "Signed,<br><br>the sf-digest daemon.</html>"; # # Save files used to store the last set of data downloaded. # It's likely that you will want to change these to a # set location (e.g. /home/user/.sf-digest-latest.txt). # my $savefile='/home/user/.sf-digest-latest.txt'; my $archivefile='/home/user/.sf-digest-archive.txt'; # # SMTP server via which to send mail. If undefined, # use local sendmail command. # my $smtp_server=''; # # Period for which results are cached prior to emailing # them (in seconds). Initially set to 1 day minus 15 # minutes (85500 seconds) # my $cacheperiod=(1*24*60*60)-(15*60); ##################################################### ### URL vars - You shouldn't need to touch these, ### ### unless SourceForge does. ### ##################################################### # # Web page to check (we will concatenate an offset # ($delta) later) # my $url = 'http://sourceforge.net/tracker/index.php?func=browse&group_ +id=36855&atid=440764&offset='; # # Offset between pages # my $delta = 50; # # Total limit on number of request IDs to download # from the web page. This is a safety feature, and # as such should not need changing. Change this if # the project ever balloons out of control :-) # my $limit=1000; # # HTML tags embedded in the email: # $idurl=$idurl_start.'<request ID>'.$idurl_end.<request ID>."</a>"; # (see below). # # - part one of requestID URL: my $idurl_start='<a href="http://sourceforge.net/tracker/index.php?fun +c=detail&aid='; # # - second part of requestID URL: my $idurl_end='&group_id=36855&atid=440764">'; ############################################# ########## End User Config Section ########## ############################################# sub gettable{ # download the table data, return a hashref # with column 1 as key and the other columns # as values, joined in a tab-delimited string my $url=shift(); my $delta=shift(); my $limit=shift(); my $tableheaders=shift(); my %results; # Here we $limit results to prevent infinite loop OFFSET: for (my $offset=0;$offset<$limit;$offset=$offset+$delta){ my $pageurl=$url.$offset; my $ua = LWP::UserAgent->new(timeout => 10); my $request = HTTP::Request->new('GET',$pageurl); my $response = $ua->request($request); if ($response->is_success){ my $te = new HTML::TableExtract( headers => $tableheaders ); $te->parse($response->content); last OFFSET unless $te->table_states; # No more table to pa +rse foreach my $ts ($te->table_states) { foreach my $rowref ($ts->rows) { my @row=@{$rowref}; # Strip out useless rows # (this is SourceForge-specific) next if (($row[0]=~ /^\S$/) || ($row[0]=~ /\<-- Prev +ious 50/)); # Format data and push into %results my $idurl=$idurl_start.$row[0].$idurl_end.$row[0]."< +/a>"; $results{$row[0]}= join("\t", $idurl, @row[1..$#row] +); } } } else { print "Error: ".$response->status_line."\n"; last OFFSET; } } return \%results; } sub readfile{ # Read in the old results file, # return old results hashref my $file=shift; my %oldresults; open (SAVEFILE,"<$file") or do { warn ("No save file; creating one named \'$file\'.\n"); return undef; }; while (my $line=<SAVEFILE>){ chomp $line; $line=~/(\w*)\t(.*)/; $oldresults{$1}=$2; } return \%oldresults; } sub writefile{ # write new results to save file my $file=shift(); my %results=%{shift()}; open (SAVEFILE,">$file") or die ("Could not open save file for wri +ting: $!\n"); foreach my $key (sort keys %results){ print SAVEFILE ("$key\t$results{$key}\n"); } } ############ ### Main ### ############ # Set the cache period to zero if we're called with # the 'now' directive (i.e. 'sf-digest now') if ($ARGV[0] && ($ARGV[0] eq 'now')){$cacheperiod = 0;} # Get old and new table data; overwrite old save # file with new data my $resref=&readfile($savefile); my %allresults=%{$resref} if $resref; my %newresults=%{&gettable($url,$delta,$limit,$tableheaders)}; # Merge the hashes to prevent false positive # upon SourceForge timeouts, # write everything out to the save file @allresults{keys %newresults} = values %newresults; &writefile($savefile,\%allresults); # We can either quit now or send the new message. # If the archive file is older than 1 day minus 5 minutes, # or if the archive file does not exist (i.e. first run), # we send the message. if ((! -f $archivefile) || (((stat($archivefile))[9]) <= (time-$cachep +eriod))){ # Read in the archive file my $archiveref=&readfile($archivefile); my %archiveresults=%{$archiveref} if $archiveref; # Construct main mail body text; #omit entries found in the archived table data my @mailbody; foreach my $id (sort keys %allresults){ # Strip out non-ascii characters #(certain mail reader programs prefer this) $allresults{$id}=~ s/[^[:ascii:]]//g; push (@mailbody, "$allresults{$id}\n") unless $archiveresults{ +$id}; } # Construct the rest of the mail and send it if (@mailbody){ # Don't send if there are no changes # Finish off the mail unshift (@mailbody, $mailbodyhead); push (@mailbody, $mailbodytail); my $body= join("<br>", @mailbody); my $mail=MIME::Lite->new( From => $mailfrom, To => $mailto, Subject => $mailsubject, Type => 'text/html', Encoding => 'quoted-printable', Data => $body, ); if ($smtp_server){ # Finally, send the mail $mail->send('smtp',$smtp_server); }else{ $mail->send(); } } # Merge all data and spew it into the archive @archiveresults{keys %allresults} = values %allresults; &writefile($archivefile,\%archiveresults); }

In reply to Web page digest mailer by tfrayner

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others cooling their heels in the Monastery: (1)
As of 2024-04-24 14:33 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found