Someone asked for it, so here it is, the
HTML::TokeParser::Simple version
package HTML::LinkExtractor;
use strict;
use HTML::TokeParser::Simple 1;
use URI 1;
use Carp qw( croak );
use vars qw( $VERSION );
$VERSION = '0.03';
## The html tags which might have URLs
# the master list of tagolas and required attributes (to constitute a
+link)
use vars qw( %TAGS );
%TAGS = (
a => [qw( href )],
applet => [qw( archive code codebase src )],
area => [qw( href )],
base => [qw( href )],
bgsound => [qw( src )],
blockquote => [qw( cite )],
body => [qw( background )],
del => [qw( cite )],
div => [qw( src )], # IE likes it, but don't know where it
+'s documented
embed => [qw( pluginspage pluginurl src )],
form => [qw( action )],
frame => [qw( src longdesc )],
iframe => [qw( src )],
ilayer => [qw( background src )],
img => [qw( dynsrc longdesc lowsrc src usemap )],
input => [qw( dynsrc lowsrc src )],
ins => [qw( cite )],
isindex => [qw( action )], # real oddball
layer => [qw( src )],
link => [qw( src href )],
object => [qw( archive classid code codebase data usemap )],
q => [qw( cite )],
script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG
+!
sound => [qw( src )],
table => [qw( background )],
td => [qw( background )],
th => [qw( background )],
tr => [qw( background )],
## the exotic case
meta => undef,
);
## tags which contain <.*?> STUFF TO GET </\w+>
use vars qw( @TAGS_IN_NEED );
@TAGS_IN_NEED = qw(
a
blockquote
del
ins
q
);
use vars qw( @VALID_URL_ATTRIBUTES );
@VALID_URL_ATTRIBUTES = qw(
action
archive
background
cite
classid
code
codebase
data
dynsrc
href
longdesc
lowsrc
pluginspage
pluginurl
src
usemap
);
sub new {
my($class, $cb, $base) = @_;
my $self = {};
$self->{_cb} = $cb if defined $cb;
$self->{_base} = URI->new($base) if defined $base;
return bless $self, $class;
}
## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filec
+ontents);
sub parse {
my( $this, $hmmm ) = @_;
my $tp = new HTML::TokeParser::Simple( $hmmm );
unless($tp) {
croak qq[ Couldn't create a HTML::TokeParser::Simple object: $
+!];
}
$this->{_tp} = $tp;
$this->_parsola();
return();
}
sub _parsola {
my $self = shift;
my $IS_WE_OPEN = 0;
## a stack of links for keeping track of TEXT
## which is all of "<a href>text</a>"
my @TEXT = ();
$self->{_LINKS} = [];
# ["S", $tag, $attr, $attrseq, $text]
# ["E", $tag, $text]
# ["T", $text, $is_data]
# ["C", $text]
# ["D", $text]
# ["PI", $token0, $text]
while (my $T = $self->{_tp}->get_token() ) {
my $NewLink;
my $Tag = $T->return_tag;
## Start tag?
if($T->is_start_tag) {
next unless exists $TAGS{$Tag};
## Do we have a tag for which we want to capture text?
my $UNIQUE = 0;
$UNIQUE = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;
## then check to see if we got things besides META :)
if(defined $TAGS{ $Tag }) {
for my $Btag(@{$TAGS{$Tag}}) {
## and we check if they do have one with a value
if(exists $T->return_attr()->{ $Btag }) {
$NewLink = $T->return_attr();
## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
if($UNIQUE) {
push @TEXT, $NewLink;
$NewLink->{_TEXT} = "";
}
}
}
}elsif($Tag eq 'meta') {
$NewLink = $T->return_attr();
}
## In case we got nested tags
if(@TEXT) {
$TEXT[-1]->{_TEXT} .= $T->return_text;
}
## Text?
}elsif($T->is_text) {
$TEXT[-1]->{_TEXT} .= $T->return_text if @TEXT;
## Declaration?
}elsif($T->is_declaration) {
## We look at declarations, to get anly custom .dtd's (tis linky)
my $text - $T->return_text;
if( $text =~ m{ SYSTEM \s \" ( http://.* ) \" > $ }ix ) {
+#"
$NewLink = { raw => $text, url => $1};
}
## End tag?
}elsif($T->is_end_tag){
## these be ignored (maybe not in between <a...></a> tags
if(@TEXT) {
$TEXT[-1]->{_TEXT} .= $T->return_text;
my $pop = pop @TEXT;
$TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
$self->{_cb}->($pop) if exists $self->{_cb};
}
}
if(defined $NewLink) {
$$NewLink{tag} = $Tag;
my $base = $self->{_base};
for my $attr( @VALID_URL_ATTRIBUTES ) {
$$NewLink{$attr} = URI->new_abs( $$NewLink{$attr}, $ba
+se )
if exists $$NewLink{$attr};
}
if(exists $self->{_cb}) {
$self->{_cb}->( $NewLink ) unless @TEXT;
} else {
push @{$self->{_LINKS}}, $NewLink;
}
}
}## endof while (my $token = $p->get_token)
undef $self->{_tp};
return();
}
sub links {
my $self = shift;
## just like HTML::LinkExtor's
return $self->{_LINKS};
}
# Preloaded methods go here.
1;
package main;
unless(caller()) {
my $p = new HTML::LinkExtractor(
sub {
print Dumper(shift);
},
);
my $INPUT = q{
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.
1 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
2 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.ht
+ml">
3 <base href="http://perl.org">
4 <a href="http://www.perlmonks.org">Perlmonks.org</a>
<p>
5 <a href="#BUTTER" href="#SCOTCH">
hello there
6 <img src="#AND" src="#PEANUTS">
7 <a href="#butter"> now </a>
</a>
8 <q CITE="http://www.shakespeare.com/">To be or not to be.</q>
9 <blockquote CITE="http://www.stonehenge.com/merlyn/">
Just Another Perl Hacker,
</blockquote>
};
$p->parse(\$INPUT);
$p = new HTML::LinkExtractor();
$p->parse(\$INPUT);
use Data::Dumper;
print scalar(@{$p->links()})." we GOT\n";
print Dumper( $p->links() );
}
__END__
=head1 NAME
HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> fr
+om an HTML document
=head1 DESCRIPTION
HTML::LinkExtractor is used for extracting links from HTML.
It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>,
except that besides getting the URL, you also get the link-text.
Example (please run the examples):
use HTML::LinkExtractor;
use Data::Dumper;
my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
my $p = new HTML::LinkExtractor();
$p->parse(\$input);
print Dumper($p->links);
__END__
# the above example will yield
$VAR1 = [
{
'_TEXT' => '<a href="http://perl.com/"> I am a LINK!!!
+ </a>',
'href' => bless(do{\(my $o = 'http://perl.com/')}, 'UR
+I::http'),
'tag' => 'a'
}
];
C<HTML::LinkExtractor> will also correctly extract nexted link-type ta
+gs.
=head1 SYNOPSIS
perl LinkExtractor.pm
## or
use HTML::LinkExtractor;
use LWP::Simple qw( get );
my $base = 'http://search.cpan.org';
my $html = get($base.'/recent');
my $p = new HTML::LinkExtractor();
$p->parse(\$html);
print qq{<base href="$base">\n};
for my $Link( @{ $p->links } ) {
## new modules are linked by /author/NAME/Dist
if( $$Link{href}=~ m{^\/author\/\w+} ) {
print $$Link{_TEXT}."\n";
}
}
undef $p;
__END__
=head1 METHODS
=head2 C<new>
Just like HTML::LinkExtor's new, it accepts 2 argument,
a callback ( a sub reference, as in C<sub{}>, or C<\&sub>)
which is to be called each time a new LINK is encountered
( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
after the closing tag is encountered )
and a base URL (it's up to you to make sure it's valid)
which is used to convert all relative URI's to absolute ones.
$ALinkP{href} = URI->new_abs( $ALink{href}, $base );
=head2 C<parse>
Each time you call C<parse>, you should pass it a
C<$filename> a C<*FILEHANDLE> or a C<\$FileContent>
Each time you call C<parse> a new C<HTML::TokeParser::Simple> object
is created and stored in C<$this-E<gt>{_tp}>.
You shouldn't need to mess with the TokeParser object.
=head2 C<links>
Only after you call C<parse> will this method return anything.
This method returns a reference to an ArrayOfHashes,
which basically looks like (Data::Dumper output)
$VAR1 = [ { type => 'img', src => 'image.png' }, ];
=head1 WHAT'S A LINK-type tag
Take a look at C<%HTML::LinkExtractor::TAGS> to see
what I consider to be link-type-tag.
Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
all the possible tag attributes which can contain URI's (the links!!)
Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
the tags for which the C<'_TEXT'> attribute is provided,
like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>>
=head2 HOW CAN THAT BE?
I took at look at C<%HTML::Tagset::linkElements> and the following URL
+'s
http://www.blooberry.com/indexdot/html/tagindex/all.htm
http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
http://www.blooberry.com/indexdot/html/tagpages/a/area.htm
http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm
http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
http://www.blooberry.com/indexdot/html/tagpages/d/div.htm
http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm
http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm
http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
http://www.blooberry.com/indexdot/html/tagpages/l/link.htm
http://www.blooberry.com/indexdot/html/tagpages/o/object.htm
http://www.blooberry.com/indexdot/html/tagpages/q/q.htm
http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm
And the special cases
<!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
and
<meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.
+html">
http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm
=head1 AUTHOR
podmaster (see CPAN) aka crazyinsomniac@yahoo.com
=head1 SEE ALSO
L<HTML::LinkExtor>, L<HTML::TokeParser::Simple>, L<HTML::Tagset>.
=cut
____________________________________________________
** The Third rule of perl club is a statement of fact: pod is sexy.
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.