comment on

Someone asked for it, so here it is, the HTML::TokeParser::Simple version
package HTML::LinkExtractor;

use strict;

use HTML::TokeParser::Simple 1;
use URI 1;
use Carp qw( croak );

use vars qw( $VERSION );
$VERSION = '0.03';

## The html tags which might have URLs
# the master list of tagolas and required attributes (to constitute a 
+link)
use vars qw( %TAGS );
%TAGS = (
              a => [qw( href )],
         applet => [qw( archive code codebase src )],
           area => [qw( href )],
           base => [qw( href )],
        bgsound => [qw( src )],
     blockquote => [qw( cite )],
           body => [qw( background )],
            del => [qw( cite )],
            div => [qw( src )], # IE likes it, but don't know where it
+'s documented
          embed => [qw( pluginspage pluginurl src )],
           form => [qw( action )],
          frame => [qw( src longdesc  )],
         iframe => [qw( src )],
         ilayer => [qw( background src )],
            img => [qw( dynsrc longdesc lowsrc src usemap )],
          input => [qw( dynsrc lowsrc src )],
            ins => [qw( cite )],
        isindex => [qw( action )], # real oddball
          layer => [qw( src )],
           link => [qw( src href )],
         object => [qw( archive classid code codebase data usemap )],
              q => [qw( cite )],
         script => [qw( src  )], # HTML::Tagset has 'for' ~ it's WRONG
+!
          sound => [qw( src )],
          table => [qw( background )],
             td => [qw( background )],
             th => [qw( background )],
             tr => [qw( background )],
  ## the exotic case
           meta => undef,
);

## tags which contain <.*?> STUFF TO GET </\w+>
use vars qw( @TAGS_IN_NEED );
@TAGS_IN_NEED = qw(
    a
    blockquote
    del
    ins
    q
);

use vars qw( @VALID_URL_ATTRIBUTES );
@VALID_URL_ATTRIBUTES = qw(
        action
        archive
        background
        cite
        classid
        code
        codebase
        data
        dynsrc
        href
        longdesc
        lowsrc
        pluginspage
        pluginurl
        src
        usemap
);


sub new {
    my($class, $cb, $base) = @_;
    my $self = {};

    $self->{_cb} = $cb if defined $cb;
    $self->{_base} = URI->new($base) if defined $base;

    return bless $self, $class;
}

## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filec
+ontents);

sub parse {
    my( $this, $hmmm ) = @_;
    my $tp = new HTML::TokeParser::Simple( $hmmm );

    unless($tp) {
        croak qq[ Couldn't create a HTML::TokeParser::Simple object: $
+!];
    }

    $this->{_tp} = $tp;

    $this->_parsola();
    return();
}

sub _parsola {
    my $self = shift;
    my $IS_WE_OPEN = 0;

## a stack of links for keeping track of TEXT
## which is all of "<a href>text</a>"
    my @TEXT = ();
    $self->{_LINKS} = [];


#  ["S",  $tag, $attr, $attrseq, $text]
#  ["E",  $tag, $text]
#  ["T",  $text, $is_data]
#  ["C",  $text]
#  ["D",  $text]
#  ["PI", $token0, $text]

    while (my $T = $self->{_tp}->get_token() ) {
        my $NewLink;
        my $Tag = $T->return_tag;
## Start tag?
        if($T->is_start_tag) {
            next unless exists $TAGS{$Tag};

## Do we have a tag for which we want to capture text?
            my $UNIQUE = 0;
            $UNIQUE = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;

## then check to see if we got things besides META :)
            if(defined $TAGS{ $Tag }) {
                for my $Btag(@{$TAGS{$Tag}}) {
## and we check if they do have one with a value
                    if(exists $T->return_attr()->{ $Btag }) {
                        $NewLink = $T->return_attr();
## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
                        if($UNIQUE) {
                            push @TEXT, $NewLink;
                            $NewLink->{_TEXT} = "";
                        }
                    }
                }
            }elsif($Tag eq 'meta') {
                $NewLink = $T->return_attr();
            }

    ## In case we got nested tags
            if(@TEXT) {
                $TEXT[-1]->{_TEXT} .= $T->return_text;
            }

## Text?
        }elsif($T->is_text) {
            $TEXT[-1]->{_TEXT} .= $T->return_text if @TEXT;
## Declaration?
        }elsif($T->is_declaration) {
## We look at declarations, to get anly custom .dtd's (tis linky)
            my $text - $T->return_text;
            if( $text =~ m{ SYSTEM \s \" ( http://.* ) \" > $ }ix ) { 
+#"
                $NewLink = { raw => $text, url => $1};
            }
## End tag?
        }elsif($T->is_end_tag){
## these be ignored (maybe not in between <a...></a> tags
            if(@TEXT) {
                $TEXT[-1]->{_TEXT} .= $T->return_text;
                my $pop = pop @TEXT;
                $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
                $self->{_cb}->($pop) if exists $self->{_cb};
            }
        }

        if(defined $NewLink) {
            $$NewLink{tag} = $Tag;

            my $base = $self->{_base};

            for my $attr( @VALID_URL_ATTRIBUTES ) {
                $$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $ba
+se )
                    if exists $$NewLink{$attr};
            }

            if(exists $self->{_cb}) {
                $self->{_cb}->( $NewLink ) unless @TEXT;
            } else {
                push @{$self->{_LINKS}}, $NewLink;
            }
        }
    }## endof while (my $token = $p->get_token)

    undef $self->{_tp};
    return();
}

sub links {
    my $self = shift;
    ## just like HTML::LinkExtor's
    return $self->{_LINKS};
}

# Preloaded methods go here.

1;

package main;

unless(caller()) {

    my $p = new HTML::LinkExtractor(
        sub {
            print Dumper(shift);
        },
    );

    my $INPUT = q{
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.

1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.ht
+ml">
3 <base href="http://perl.org">
4 <a href="http://www.perlmonks.org">Perlmonks.org</a>
<p>

5 <a href="#BUTTER"  href="#SCOTCH">
    hello there
6 <img src="#AND" src="#PEANUTS">
7    <a href="#butter"> now </a>
</a>

8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q>
9 <blockquote CITE="http://www.stonehenge.com/merlyn/">
    Just Another Perl Hacker,
</blockquote>
    };

    $p->parse(\$INPUT);

    $p = new HTML::LinkExtractor();
    $p->parse(\$INPUT);

    use Data::Dumper;
    print scalar(@{$p->links()})." we GOT\n";
    print Dumper( $p->links() );    
    
}

__END__



=head1 NAME

HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> fr
+om an HTML document

=head1 DESCRIPTION

HTML::LinkExtractor is used for extracting links from HTML.
It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>,
except that besides getting the URL, you also get the link-text.

Example (please run the examples):

    use HTML::LinkExtractor;
    use Data::Dumper;
    
    my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
    my $p = new HTML::LinkExtractor();
    
    $p->parse(\$input);
    
    print Dumper($p->links);
    __END__
    # the above example will yield
    $VAR1 = [
              {
                '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!!
+ </a>',
                'href' => bless(do{\(my $o = 'http://perl.com/')}, 'UR
+I::http'),
                'tag' => 'a'
              }
            ];

C<HTML::LinkExtractor> will also correctly extract nexted link-type ta
+gs.

=head1 SYNOPSIS

    perl LinkExtractor.pm
    
    ## or

    use HTML::LinkExtractor;
    use LWP::Simple qw( get );

    my $base = 'http://search.cpan.org';
    my $html = get($base.'/recent');
    my $p = new HTML::LinkExtractor();

    $p->parse(\$html);

    print qq{<base href="$base">\n};

    for my $Link( @{ $p->links } ) {
    ## new modules are linked  by /author/NAME/Dist
        if( $$Link{href}=~ m{^\/author\/\w+} ) {
            print $$Link{_TEXT}."\n";
        }
    }

    undef $p;
    __END__


=head1 METHODS

=head2 C<new>

Just like HTML::LinkExtor's new, it accepts 2 argument,
a callback ( a sub reference, as in C<sub{}>, or C<\&sub>)
which is to be called each time a new LINK is encountered
( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
 after the closing tag is encountered )

and a base URL (it's up to you to make sure it's valid)
which is used to convert all relative URI's to absolute ones.

    $ALinkP{href} = URI->new_abs( $ALink{href}, $base );

=head2 C<parse>

Each time you call C<parse>, you should pass it a
C<$filename> a C<*FILEHANDLE> or a C<\$FileContent>

Each time you call C<parse> a new C<HTML::TokeParser::Simple> object
is created and stored in C<$this-E<gt>{_tp}>.

You shouldn't need to mess with the TokeParser object.

=head2 C<links>

Only after you call C<parse> will this method return anything.
This method returns a reference to an ArrayOfHashes,
which basically looks like (Data::Dumper output)

    $VAR1 = [ { type => 'img', src => 'image.png' }, ];


=head1 WHAT'S A LINK-type tag

Take a look at C<%HTML::LinkExtractor::TAGS> to see
what I consider to be link-type-tag.

Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
all the possible tag attributes which can contain URI's (the links!!)

Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
the tags for which the C<'_TEXT'> attribute is provided,
like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>>


=head2 HOW CAN THAT BE?

I took at look at C<%HTML::Tagset::linkElements> and the following URL
+'s

    http://www.blooberry.com/indexdot/html/tagindex/all.htm

    http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
    http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
    http://www.blooberry.com/indexdot/html/tagpages/a/area.htm

    http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
    http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm

    http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
    http://www.blooberry.com/indexdot/html/tagpages/d/div.htm

    http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
    http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm

    http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
    http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm

    http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
    http://www.blooberry.com/indexdot/html/tagpages/l/link.htm

    http://www.blooberry.com/indexdot/html/tagpages/o/object.htm

    http://www.blooberry.com/indexdot/html/tagpages/q/q.htm

    http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
    http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm

    And the special cases 

    <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
    http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
    
    and

    <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.
+html">
    http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm


=head1 AUTHOR

podmaster (see CPAN) aka crazyinsomniac@yahoo.com

=head1 SEE ALSO

L<HTML::LinkExtor>, L<HTML::TokeParser::Simple>, L<HTML::Tagset>.

=cut
[download]
____________________________________________________
** The Third rule of perl club is a statement of fact: pod is sexy.
In reply to Re: HTML::LinkExtractor by PodMaster
in thread HTML::LinkExtractor by PodMaster
Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.

good chemistry is complicated, and a little bit messy -LW
	PerlMonks