#!/usr/bin/perl
use strict;
use warnings;
use HTML::TokeParser::Simple;
# Slurp the embedded HTML document from the DATA section in a single read.
my $html = do { local $/; <DATA> };
my $p = HTML::TokeParser::Simple->new(\$html);

# Values harvested from the document <head>: the <title> text and the
# content of the "description" and "keywords" <meta> tags.
my ($title, $content, $keywords);

# Pass 1: walk the tokens of the <head>, stopping once <body> opens.  The
# <body> start tag is consumed by this loop, so any later reads from $p
# begin with the first token after it.
while (my $t = $p->get_token) {
    last if $t->is_start_tag('body');

    if ($t->is_start_tag('title')) {
        $title = $p->get_trimmed_text('/title');
        next;
    }

    next unless $t->is_start_tag('meta');

    # Compare the meta name case-insensitively.  The embedded document
    # spells it "keywords" in lower case, which an exact comparison against
    # 'Keywords' never matched, leaving $keywords permanently undefined.
    my $name = $t->get_attr('name');
    next unless defined $name;
    $name = lc $name;

    if ($name eq 'description') {
        $content = $t->get_attr('content');
    }
    elsif ($name eq 'keywords') {
        $keywords = $t->get_attr('content');
    }
}

# Report what was found.  A missing element prints as an empty value
# instead of triggering an "uninitialized value" warning.  The keywords
# line is new: the value was collected before but never reported.
for my $field (
    [ title    => $title ],
    [ content  => $content ],
    [ keywords => $keywords ],
) {
    my ($label, $value) = @{$field};
    $value = '' unless defined $value;
    print "$label: $value\n";
}
# Pass 2: continue reading tokens from $p and report
#   * the first text chunk that follows each heading / inline start tag, and
#   * the alt text of every <img> that has one.
# $tag remembers the most recent tag of interest until its text is printed.
my $tag;

# The alternation is grouped so that both anchors apply to every branch:
# only h1..h6, b, i, u and a match.  The ungrouped /^h[123456]|[biua]$/
# parsed as (^h[1-6]) | ([biua]$) and therefore also matched any tag that
# merely ends in b, i, u or a (li, meta, sub, area, ...).
my $wanted = qr/^(?:h[1-6]|[biua])$/;

while (my $t = $p->get_token) {
    $tag = $t->get_tag if $t->is_start_tag($wanted);

    if ($t->is_start_tag('img')) {
        # Test for a defined, non-empty alt rather than plain truthiness,
        # so that alt="0" is still reported.
        my $attr = $t->get_attr('alt');
        if (defined $attr and length $attr) {
            print "img attr: $attr\n";
            $tag = '';
        }
    }
    elsif ($tag and $t->is_text) {
        # First text token after a remembered tag: print it, then forget the
        # tag so later text is not attributed to it.
        my $txt = $t->as_is;
        print "$tag: $txt\n";
        $tag = '';
    }
}
__DATA__
<html>
<head>
<title>henka's test page</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="Description" content="the glories of HTML::TokeParser::Simple" />
<meta name="keywords" content="one two three four five six seven eight nine ten" />
<meta name="robots" content="noindex" />
<link rel="stylesheet" type="text/css" href="cwi.css" />
</head>
<body>
<h1>header one</h1>
<h2>header two</h2>
<h3>header three</h3>
<h4>header four</h4>
<h5>header five</h5>
<h6>header siz</h6>
<p>p tag paragraph</p>
<p>p tag containing <u>underline</u> and <b>bold</b> and a <a href="link.html">link</a></p>
<img alt="image alt text" src="my.gif">
</body>
</html>
output:
---------- Capture Output ----------
> "c:\perl\bin\perl.exe" monk06.pl
title: henka's test page
content: the glories of HTML::TokeParser::Simple
h1: header one
h2: header two
h3: header three
h4: header four
h5: header five
h6: header siz
u: underline
b: bold
a: link
img attr: image alt text
> Terminated with exit code 0.
|