I am wondering how to find the values of an attribute of an element which has been extracted using XML::XPath::XMLParser.
If I have the following program, I can get the element name with ${$node}->[5]. But what expression would I need to use to get the values of the 'href' attributes in the 'a' elements?
#!/usr/bin/perl
use HTML::Tidy;
use XML::XPath;
use XML::XPath::XMLParser;
use Data::Dumper;
use strict;
use warnings;
# Slurp the sample HTML that follows the __DATA__ marker into one string.
my $body = do { local $/; <DATA> };

# Have HTML Tidy emit XML (with numeric entities) so that the cleaned
# markup can be handed to XML::XPath.
my $tidy  = HTML::Tidy->new({ output_xml => 1, numeric_entities => 1 });
my $clean = $tidy->clean($body);

my $parser = XML::XPath->new(xml => $clean);

# Select every <a> element that is a direct child of a <p> element.
my $set   = '//p/a';
my $nodes = $parser->find($set);

# For each match, print its serialized form followed by a dump of the
# underlying node object.
foreach my $node ($nodes->get_nodelist) {
    print "\n";
    print "FOUND\n\n", XML::XPath::XMLParser::as_string($node), "\n";
    print Dumper($node);
    # print ${$node}->[5],"\n"; # element name
}
exit(0);

# The data marker needs two leading underscores; with only one, the DATA
# handle is never opened and the HTML below is parsed as Perl source.
__DATA__
<!doctype html>
<html class="no-focus-outline no-js"
lang="en-US" data-modal-active="true">
<head>
<title>test</title>
</head>
<body>
<h1>test heading</h1
<div>
<p>paragraph one
<a href="https://example.com/one/two.html">one</a> example.</p>
<p>paragraph two
<a href="https://example.com/two/three.html">another</a> example.</p>
</div>
</body>
</html>
An excerpt from XML::XPath::XMLParser describes the attribute structure like this:
...
Element Node
[
$parent, # node_parent
<position in current array>, # node_pos
'xxx', # node_prefix - namespace prefix on this element
[ ... ], # node_children
'yyy', # node_name - element tag name
[ ... ], # node_attribs - attributes on this element
[ ... ], # node_namespaces - namespaces currently in scope
]
Attribute Node
[
$parent, # node_parent - the element node
<position in current array>, # node_pos
'xxx', # node_prefix - namespace prefix on this element
'href', # node_key - attribute name
'ftp://ftp.com/', # node_value - value in the node
]
...
The output from Data::Dumper for $node variables is like this:
FOUND
<a href="https://example.com/one/two.html">one</a>
$VAR1 = bless( do{\(my $o = bless( [
bless( [
bless( [
bless( [
bless( [
bless( [
+ undef,
+ undef,
+ 5,
+ undef,
+ [
+ bless( do{\(my $o = ${$VAR1}->[0]->[0]->[0]->[0])}, 'XML::XPath::N
+ode::Element' )
+ ],
+ undef,
+ [],
+ [
+ bless( do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0]->[0],
+ 0,
+ 10,
+ 'xml',
+ 'http://www.w3.org/XML/1998/namespace'
+ ], 'XML::XPath::Node::NamespaceImpl' ))}, 'XML::XP
+ath::Node::Namespace' )
+ ]
]
+, 'XML::XPath::Node::ElementImpl' ),
0,
15,
'',
[
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 0,
+ 35,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Tex
+t' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 1,
+ 40,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 0,
+ 45,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'X
+ML::XPath::Node::Text' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 1,
+ 50,
+ '',
+ [],
+ 'meta',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]
+->[0]->[0]->[0]->[4]->[1]}->[4]->[1]},
+ 0,
+ 55,
+ '',
+ 'name',
+ 'generator'
+ ], 'XML::XPath::Nod
+e::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ),
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]
+->[0]->[0]->[0]->[4]->[1]}->[4]->[1]},
+ 1,
+ 60,
+ '',
+ 'content',
+ 'HTML Tidy for HT
+ML5 for Linux version 5.6.0'
+ ], 'XML::XPath::Nod
+e::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' )
+ ]
+ ], 'XML::XPath::Node::ElementImpl' ))},
+ 'XML::XPath::Node::Element' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 2,
+ 65,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'X
+ML::XPath::Node::Text' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 3,
+ 70,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]
+->[0]->[0]->[0]->[4]->[1]}->[4]->[3]},
+ 0,
+ 75,
+ 'test'
+ ], 'XML::XPath::Nod
+e::TextImpl' ))}, 'XML::XPath::Node::Text' )
+ ],
+ 'title',
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))},
+ 'XML::XPath::Node::Element' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 4,
+ 80,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'X
+ML::XPath::Node::Text' )
+ ],
+ 'head',
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::
+Element' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 2,
+ 85,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Tex
+t' ),
bless(
+ do{\(my $o = ${$VAR1}->[0]->[0]->[0])}, 'XML::XPath::Node::Element'
+),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 4,
+ 200,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Tex
+t' )
],
'html',
[
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 0,
+ 20,
+ '',
+ 'class',
+ 'no-focus-outline no-js'
+ ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node
+::Attribute' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 1,
+ 25,
+ '',
+ 'lang',
+ 'en-US'
+ ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node
+::Attribute' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 2,
+ 30,
+ '',
+ 'data-modal-active',
+ 'true'
+ ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node
+::Attribute' )
]
], 'XML::X
+Path::Node::ElementImpl' ),
3,
90,
'',
[
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 0,
+ 95,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 1,
+ 100,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[4]->[1]},
+ 0,
+ 105,
+ "test\x{a0}heading"
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath
+::Node::Text' )
+ ],
+ 'h1',
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element'
+),
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 2,
+ 110,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my
+$o = ${$VAR1}->[0]->[0])}, 'XML::XPath::Node::Element' ),
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 4,
+ 195,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' )
],
'body',
[]
], 'XML::XPath::Nod
+e::ElementImpl' ),
3,
115,
'',
[
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
0,
120,
'
'
], 'XML:
+:XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my $o = ${$V
+AR1}->[0])}, 'XML::XPath::Node::Element' ),
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
2,
155,
'
'
], 'XML:
+:XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
3,
160,
'',
[
bles
+s( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[4]->[3]},
+ 0,
+ 165,
+ 'paragraph two
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::T
+ext' ),
bles
+s( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[4]->[3]},
+ 1,
+ 170,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]->[0]->[4]->[3]}->
+[4]->[1]},
+ 0,
+ 180,
+ 'another'
+ ], 'XML::XPath::Node::TextImpl' ))},
+'XML::XPath::Node::Text' )
+ ],
+ 'a',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]->[0]->[4]->[3]}->
+[4]->[1]},
+ 0,
+ 175,
+ '',
+ 'href',
+ 'https://example.com/two/three.html
+'
+ ], 'XML::XPath::Node::AttributeImpl'
+))}, 'XML::XPath::Node::Attribute' )
+ ]
+ ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node
+::Element' ),
bles
+s( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[4]->[3]},
+ 2,
+ 185,
+ ' example.'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::T
+ext' )
],
'p',
[]
], 'XML:
+:XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ),
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
4,
190,
'
'
], 'XML:
+:XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' )
],
'div',
[]
], 'XML::XPath::Node::Elemen
+tImpl' ),
1,
125,
'',
[
bless( do{\(my $o = bless( [
${$VAR1}->[0],
0,
130,
'paragraph one
'
], 'XML::XPath::N
+ode::TextImpl' ))}, 'XML::XPath::Node::Text' ),
$VAR1,
bless( do{\(my $o = bless( [
${$VAR1}->[0],
2,
150,
' example.'
], 'XML::XPath::N
+ode::TextImpl' ))}, 'XML::XPath::Node::Text' )
],
'p',
[]
], 'XML::XPath::Node::ElementImpl' ),
1,
135,
'',
[
bless( do{\(my $o = bless( [
${$VAR1},
0,
145,
'one'
], 'XML::XPath::Node::Text
+Impl' ))}, 'XML::XPath::Node::Text' )
],
'a',
[
bless( do{\(my $o = bless( [
${$VAR1},
0,
140,
'',
'href',
'https://example.com/one
+/two.html'
], 'XML::XPath::Node::Attr
+ibuteImpl' ))}, 'XML::XPath::Node::Attribute' )
],
[]
], 'XML::XPath::Node::ElementImpl' ))}, 'XML::
+XPath::Node::Element' );
FOUND
<a href="https://example.com/two/three.html">another</a>
$VAR1 = bless( do{\(my $o = bless( [
bless( [
bless( [
bless( [
bless( [
bless( [
+ undef,
+ undef,
+ 5,
+ undef,
+ [
+ bless( do{\(my $o = ${$VAR1}->[0]->[0]->[0]->[0])}, 'XML::XPath::N
+ode::Element' )
+ ],
+ undef,
+ [],
+ [
+ bless( do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0]->[0],
+ 0,
+ 10,
+ 'xml',
+ 'http://www.w3.org/XML/1998/namespace'
+ ], 'XML::XPath::Node::NamespaceImpl' ))}, 'XML::XP
+ath::Node::Namespace' )
+ ]
]
+, 'XML::XPath::Node::ElementImpl' ),
0,
15,
'',
[
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 0,
+ 35,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Tex
+t' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 1,
+ 40,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 0,
+ 45,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'X
+ML::XPath::Node::Text' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 1,
+ 50,
+ '',
+ [],
+ 'meta',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]
+->[0]->[0]->[0]->[4]->[1]}->[4]->[1]},
+ 0,
+ 55,
+ '',
+ 'name',
+ 'generator'
+ ], 'XML::XPath::Nod
+e::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' ),
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]
+->[0]->[0]->[0]->[4]->[1]}->[4]->[1]},
+ 1,
+ 60,
+ '',
+ 'content',
+ 'HTML Tidy for HT
+ML5 for Linux version 5.6.0'
+ ], 'XML::XPath::Nod
+e::AttributeImpl' ))}, 'XML::XPath::Node::Attribute' )
+ ]
+ ], 'XML::XPath::Node::ElementImpl' ))},
+ 'XML::XPath::Node::Element' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 2,
+ 65,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'X
+ML::XPath::Node::Text' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 3,
+ 70,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]
+->[0]->[0]->[0]->[4]->[1]}->[4]->[3]},
+ 0,
+ 75,
+ 'test'
+ ], 'XML::XPath::Nod
+e::TextImpl' ))}, 'XML::XPath::Node::Text' )
+ ],
+ 'title',
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))},
+ 'XML::XPath::Node::Element' ),
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[0]->[4]->
+[1]},
+ 4,
+ 80,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'X
+ML::XPath::Node::Text' )
+ ],
+ 'head',
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::
+Element' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 2,
+ 85,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Tex
+t' ),
bless(
+ do{\(my $o = ${$VAR1}->[0]->[0]->[0])}, 'XML::XPath::Node::Element'
+),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 4,
+ 200,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Tex
+t' )
],
'html',
[
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 0,
+ 20,
+ '',
+ 'class',
+ 'no-focus-outline no-js'
+ ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node
+::Attribute' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 1,
+ 25,
+ '',
+ 'lang',
+ 'en-US'
+ ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node
+::Attribute' ),
bless(
+ do{\(my $o = bless( [
+ ${$VAR1}->[0]->[0]->[0]->[0],
+ 2,
+ 30,
+ '',
+ 'data-modal-active',
+ 'true'
+ ], 'XML::XPath::Node::AttributeImpl' ))}, 'XML::XPath::Node
+::Attribute' )
]
], 'XML::X
+Path::Node::ElementImpl' ),
3,
90,
'',
[
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 0,
+ 95,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 1,
+ 100,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[0]->[4]->[1]},
+ 0,
+ 105,
+ "test\x{a0}heading"
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath
+::Node::Text' )
+ ],
+ 'h1',
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element'
+),
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 2,
+ 110,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my
+$o = ${$VAR1}->[0]->[0])}, 'XML::XPath::Node::Element' ),
bless( do{\(my
+$o = bless( [
+ ${$VAR1}->[0]->[0]->[0],
+ 4,
+ 195,
+ '
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' )
],
'body',
[]
], 'XML::XPath::Nod
+e::ElementImpl' ),
3,
115,
'',
[
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
0,
120,
'
'
], 'XML:
+:XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
1,
125,
'',
[
bles
+s( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[4]->[1]},
+ 0,
+ 130,
+ 'paragraph one
'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::T
+ext' ),
bles
+s( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[4]->[1]},
+ 1,
+ 135,
+ '',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]->[0]->[4]->[1]}->
+[4]->[1]},
+ 0,
+ 145,
+ 'one'
+ ], 'XML::XPath::Node::TextImpl' ))},
+'XML::XPath::Node::Text' )
+ ],
+ 'a',
+ [
+ bless( do{\(my $o = bless( [
+ ${${${$VAR1}->[0]->[0]->[4]->[1]}->
+[4]->[1]},
+ 0,
+ 140,
+ '',
+ 'href',
+ 'https://example.com/one/two.html'
+ ], 'XML::XPath::Node::AttributeImpl'
+))}, 'XML::XPath::Node::Attribute' )
+ ],
+ []
+ ], 'XML::XPath::Node::ElementImpl' ))}, 'XML::XPath::Node
+::Element' ),
bles
+s( do{\(my $o = bless( [
+ ${${$VAR1}->[0]->[0]->[4]->[1]},
+ 2,
+ 150,
+ ' example.'
+ ], 'XML::XPath::Node::TextImpl' ))}, 'XML::XPath::Node::T
+ext' )
],
'p',
[]
], 'XML:
+:XPath::Node::ElementImpl' ))}, 'XML::XPath::Node::Element' ),
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
2,
155,
'
'
], 'XML:
+:XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' ),
bless( do{\(my $o = ${$V
+AR1}->[0])}, 'XML::XPath::Node::Element' ),
bless( do{\(my $o = bles
+s( [
${$VAR
+1}->[0]->[0],
4,
190,
'
'
], 'XML:
+:XPath::Node::TextImpl' ))}, 'XML::XPath::Node::Text' )
],
'div',
[]
], 'XML::XPath::Node::Elemen
+tImpl' ),
3,
160,
'',
[
bless( do{\(my $o = bless( [
${$VAR1}->[0],
0,
165,
'paragraph two
'
], 'XML::XPath::N
+ode::TextImpl' ))}, 'XML::XPath::Node::Text' ),
$VAR1,
bless( do{\(my $o = bless( [
${$VAR1}->[0],
2,
185,
' example.'
], 'XML::XPath::N
+ode::TextImpl' ))}, 'XML::XPath::Node::Text' )
],
'p',
[]
], 'XML::XPath::Node::ElementImpl' ),
1,
170,
'',
[
bless( do{\(my $o = bless( [
${$VAR1},
0,
180,
'another'
], 'XML::XPath::Node::Text
+Impl' ))}, 'XML::XPath::Node::Text' )
],
'a',
[
bless( do{\(my $o = bless( [
${$VAR1},
0,
175,
'',
'href',
'https://example.com/two
+/three.html'
], 'XML::XPath::Node::Attr
+ibuteImpl' ))}, 'XML::XPath::Node::Attribute' )
],
[]
], 'XML::XPath::Node::ElementImpl' ))}, 'XML::
+XPath::Node::Element' );