#!/usr/bin/perl
=head1 NAME
exp -- extract XPath matches from XML data
=head1 SYNOPSIS
exp [-x] [-r] {-p xpath_spec | -f xpath.list} file.xml
=head1 DESCRIPTION
This program will print portions (if any) from an XML file that match
a given XPath specifier. You can provide an XPath spec on the command
line (via the "-p /path" option), or read a list of one or more path
specs from a file (via the "-f path.list" option). The matching
content is always printed to STDOUT.
By default, exp prints just the text content of matches. Use the "-x"
option to print full XML strings.
In case your "file.xml" happens to be a concatenation of elements
without a root tag (e.g. a stream of .........),
you can use the "-r" option to provide a root tag automatically before
parsing.
=head1 ASSUMPTIONS
We assume that the character encoding is utf8 (and we set STDOUT
accordingly). We also assume that the entire XML input fits in memory
(we use XML::LibXML, which tend to be fairly compact); in the case of
using the -r option, the XML source text becomes memory resident as
well, before passing it to XML::LibXML for parsing.
=head1 AUTHOR
David Graff
=cut
use strict;
use XML::LibXML;
use Getopt::Long;
binmode STDOUT,":utf8";
my $Usage = "Usage: $0 [-x] [-r] {-p xpath_spec | -f xpath.list} file.xml\n";
my %opt;
die $Usage unless ( GetOptions( \%opt, 'x', 'r', 'p=s', 'f=s' ) and
@ARGV == 1 and -f $ARGV[0] and ( $opt{p} or $opt{f} ));
my $xmlfile = shift;
my @paths;
push @paths, $opt{p} if ( $opt{p} =~ /\w/ );
if ( $opt{f} ) {
open( my $list, "<", $opt{f} ) or die "$opt{f}: $!\n";
while (<$list>) {
next if ( /^\s*[;\#-]/ or !/\w/ );
chomp;
push @paths, $_;
}
}
my $xml = XML::LibXML->new;
my $doc;
if ( ! $opt{r} ) {
$doc = $xml->parse_file( $xmlfile );
}
else {
my $xmlstr = "";
{
local $/;
open( X, '<:utf8', $xmlfile ) or die "Unable to read $xmlfile: $!\n";
$xmlstr .= ;
close X;
}
$xmlstr .= "";
s{^(?!//)}{/EXP_ROOT_$$} for ( @paths );
$doc = $xml->parse_string( $xmlstr );
}
my $pth = XML::LibXML::XPathContext->new( $doc );
for my $p ( @paths ) {
for my $n ( $pth->findnodes( $p )) {
if ( $opt{x} ) {
print $n->toString, "\n";
} else {
print $n->textContent, "\n";
}
}
}