comment on

#!/usr/bin/perl -w
use strict;
#--
#-- Script:    SETIStat.pl
#-- Purpose:   Displays the information for the SETI@Home PM group
#--
#-- Author:    Robert(Bob) Smith
#-- Date:      December 11, 2001
#--
#-- Wish List: Add error handling if the requested HTML page is not retrieved
#--
#-- Rev Hist:  00.00.a 2001-12-11 rws  Initial version
#--
#-- Notes:     This script was created as a learning example for HTML::TableExtract
#--

#-- Use modules
use HTML::TableExtract;
use HTTP::Request::Common;
use LWP::UserAgent;


#-- Define constants
#-- Script version (same value as the latest Rev Hist entry in the header).
use constant VERSION                   => '00.00.a';

#-- Separator written between the columns of every output line.
use constant FIELD_DELIM               => ',';

#-- Team statistics page that is downloaded and parsed.  The literal must
#-- stay on one line: a line break inside the quotes becomes part of the
#-- URL and the request then targets a different, invalid address.
use constant SETI_URL                  =>
  'http://setiathome.ssl.berkeley.edu/stats/team/team_86606.html';

#-- Process exit codes, looked up by name (for example ERR->{ok}).
use constant ERR                       =>
  {
    'ok'                               => 0,
  };


#-- Define variables
#-- File-scoped working variables shared by the steps below.  Each trailing
#-- comment is kept on a single line: a wrapped comment would leave its
#-- continuation outside the comment, where Perl parses it as code.
my $gExtractedTable;                   #  Table extracted from the HTML
my $gHTMLPage;                         #  Retrieved HTML page
my $gName;                             #  Member's name
my $gRank;                             #  Member's ranking
my $gRow;                              #  Pointer to rows in extracted tables
my $gTable;                            #  Pointer to extracted tables
my $gUserAgent;                        #  LWP::UserAgent


#-- Retrieve the HTML page
$gUserAgent = LWP::UserAgent->new;
$gHTMLPage = $gUserAgent->request(GET(SETI_URL));

#-- Stop when the request did not succeed.  Without this check the body of
#-- a failed response would be handed to the table parser as if it were the
#-- team page.  The status line is reported so the cause is visible.
unless ($gHTMLPage->is_success)
{
  die 'Unable to retrieve ', SETI_URL, ': ', $gHTMLPage->status_line, "\n";
}


#-- Extract the table
#--
#-- Note: TableExtract will handle tables nested within tables (outermost
#--       table is depth==0) as well as multiple tables within the same
#--       HTML document (first table is count==0).
#--       Since the information I wish to extract is not nested, depth
#--       will be 0, and since it is the second table on the page (the
#--       first table is the group description, web site, number of
#--       members, etc.) the count will be 1.
#--
$gExtractedTable = HTML::TableExtract->new(depth => 0, count => 1);
$gExtractedTable->parse($gHTMLPage->content);


#-- Display information
#--
#-- Every row of the extracted table is written to STDOUT as one line of
#-- FIELD_DELIM separated values:
#--   * data rows (first cell starts with "<digits>)") are printed as
#--     rank, name, and the remaining three cells;
#--   * any other row is treated as the header and printed with a leading
#--     'Rank' column.
#--
#-- NOTE(review): newer HTML::TableExtract releases appear to document
#-- tables() as the preferred name for table_states() - confirm against the
#-- installed version before switching.
foreach $gTable ($gExtractedTable->table_states)
{
  foreach $gRow ($gTable->rows)
  {

    #-- Work on a copy of the first four cells so the parser's own data is
    #-- left untouched, and turn undefined or missing cells into empty
    #-- strings so they print as empty fields without -w warnings.
    my @cells = map { defined $_ ? $_ : '' } @$gRow[0 .. 3];

    #-- Print data row
    #--
    #-- One anchored match captures the rank and the name with surrounding
    #-- whitespace removed.  Explicit captures replace the $' and $` match
    #-- variables, which depend on which match ran last.
    if ($cells[0] =~ /\A(\d+)\)\s*(.*?)\s*\z/s)
    {
      ($gRank, $gName) = ($1, $2);
      $cells[2] =~ s/\A\s+//;          #  Drop leading whitespace only

      print join(FIELD_DELIM, $gRank, $gName, @cells[1 .. 3]), "\n";
    }

    #-- Print header row
    else
    {
      #-- Line breaks inside these header cells become spaces so that the
      #-- header stays on a single output line.
      $cells[1] =~ tr/\n/ /;
      $cells[3] =~ tr/\n/ /;

      print join(FIELD_DELIM, 'Rank', @cells), "\n";
    }
  }
}


#-- Exit
exit(ERR->{ok});


#-- End of script
[download]

In reply to Extracting information from the SETI@Home PM group by Rhose

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


"be consistent"
	PerlMonks