Just for fun, here's a low-level solution using vanilla HTML::Parser.
use strict;
use warnings;
use HTML::Parser;
# Two sample documents used to exercise the link extraction below:
# $html[0] is the HTML variant and $html[1] the XHTML variant of the same
# markup. Both contain deliberately awkward cases: an <a> tag split across
# several lines, an <a> without an href, an anchor inside a comment,
# href-like text inside a title attribute, anchor markup inside a script
# string, stray comment delimiters inside script comments, and link text
# interrupted by a nested <span>. The second document additionally wraps
# script bodies and some text in CDATA marked sections.
my @html = (<<EOT
<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<script>
console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
*/ </script>
<a href="http://www.example.com/7">Se<span
>ve</span>n</a>
<script>/* --> */</script>
EOT
, <<EOT
<a
href
=
"http://www.example.com/1"
>
One
</a
>
<a id="Two" title="href="></a>
<!--
<a href="http://www.example.com/3">Three</a>
-->
<a title=' href="http://www.example.com/4">Four'
href="http://www.example.com/5">Five</a>
<script type="text/javascript">/*<![CDATA[
</script>
*/ console.log(' <a href="http://www.example.com/6">Six</a> '); /*
<!--
]]>*/</script>
<a href="http://www.example.com/7"><![CDATA[Se]]><span
>ve</span>n</a>
<script type="text/javascript">/*<![CDATA[
-->
]]>*/</script>
<![CDATA[
<a href="http://www.example.com/8">Eight</a>
]]>
EOT
);
# True while the parser is inside an <a> element that carried an href.
my $state = 0;

# Event-driven link extractor. For every <a> start tag with an href it
# prints the href followed by a tab, then prints the (trimmed, entity-decoded)
# text chunks seen until the closing </a>, and finally a newline.
my $p = HTML::Parser->new(
    api_version => 3,
    start_h     => [
        sub {
            my ($tagname, $attr, $self) = @_;
            return unless $tagname eq 'a';

            # Test for a defined, non-empty value rather than plain truth so
            # that a legitimate href="0" is not silently dropped.
            my $href = $attr->{href};
            return unless defined $href && length $href;

            $state = 1;
            print "$href\t";

            # Text is only of interest inside a link, so the text handler is
            # installed on demand and removed again by the end handler.
            $self->handler(
                text => sub {
                    my ($dtext) = @_;
                    print trim($dtext);
                },
                'dtext, self'
            );
        },
        'tagname, attr, self'
    ],
    end_h => [
        sub {
            my ($tagname, $self) = @_;

            # Ignore end tags other than </a>, and </a> tags whose start tag
            # had no href (nothing was printed for those).
            return unless $tagname eq 'a' && $state;

            $state = 0;
            print "\n";

            # An empty-string handler makes the parser ignore text events.
            $self->handler(text => '');
        },
        'tagname, self'
    ],
);
# Run the parser over both sample documents, printing a heading for each
# mode followed by the "href<TAB>text" lines emitted by the handlers.
print "HTML:\n";
$p->parse($html[0]);

# Signal end of document: this flushes any text the parser is still
# buffering and resets its state, so the next document is not parsed as a
# continuation of this one.
$p->eof;

print "XHTML:\n";
$p->xml_mode(1);
$p->marked_sections(1);
$p->parse($html[1]);
$p->eof;
# Return a copy of the given string with leading and trailing whitespace
# removed; the caller's value is left untouched.
sub trim {
    my ($text) = @_;
    $text =~ s/^\s+|\s+$//g;
    return $text;
}