1: =head1 NAME
2:
3: XML::TokeParser::Simple - pretty much like HTML::TokeParser::Simple
4:
5: =head1 DESCRIPTION
6:
7: Get the benefits of L<XML::TokeParser::TOKEN|"What is XML::TokeParser::TOKEN">.
8:
9: use XML::TokeParser::Simple and get the blessing ;)
10:
11: Hopefully XML::TokeParser will add this in the next version.
12:
13:
14: =head1 SYNOPSIS
15:
16: # file: printComments.pl
17: # desc: prints all the comments from an xml file
18: use XML::TokeParser::Simple;
19: my $p = new XML::TokeParser::Simple('file.xml');
20: while(defined( my $t = $p->get_token )) {
21: print $t->text,"\n" if $t->is_comment;
22: }
23:
24: See L<"What is XML::TokeParser::TOKEN">
25:
26: =cut
27:
28: package XML::TokeParser::Simple;
29: use XML::TokeParser();
30: use vars qw/ @ISA $VERSION /;
31: $VERSION = '0.01';
32: @ISA = qw/ XML::TokeParser /;
33:
34: sub get_token {
35: my $self = shift;
36: my $token = $self->SUPER::get_token( @_ );
37: return unless defined $token;
38: bless $token, 'XML::TokeParser::Token';
39: }
40:
41: sub get_tag {
42: my $self = shift;
43: my $token = $self->SUPER::get_tag( @_ );
44: return unless defined $token;
45: bless $token, 'XML::TokeParser::Token';
46: }
47:
48: package XML::TokeParser::Token;
49: use strict;
50:
51: =head1 What is XML::TokeParser::TOKEN
52:
53: A token is a blessed array reference,
54: that you acquire using $p->get_token,
55: and that might look like:
56:
57: ["S", $tag, $attr, $attrseq, $raw]
58: ["E", $tag, $raw]
59: ["T", $text, $raw]
60: ["C", $text, $raw]
61: ["PI", $target, $data, $raw]
62:
63: If you don't like remembering array indices,
64: you may access the attributes of a token like:
65:
66: $p->get_token->tag, $t->attr, $t->attrseg, $t->raw ...
67:
68: =head2 Methods
69:
70: Tokens may be inspected using any of these is_* methods
71:
72: is_text
73: is_comment
74: is_pi
75: is_process_instruction
76: is_start_tag
77: is_end_tag
78: is_tag
79:
80: like:
81:
82: print $t->target if $t->is_pi;
83: print "The comment says ", $t->text if $t->is_comment;
84:
85: =cut
86:
87: # for PI
88: sub target { return $_[0]->[1] if $_[0]->is_pi; }
89: sub data { return $_[0]->[2] if $_[0]->is_pi; }
90: sub raw { return $_[0]->[-1]; }
91:
92: #for S
93: sub attr { return $_[0]->[2] if $_[0]->is_start_tag(); }
94: sub attrseq { return $_[0]->[3] if $_[0]->is_start_tag(); }
95:
96: #for S|E
97: sub tag { return $_[0]->[1] if $_[0]->is_tag; }
98:
99: #for C|T
100: sub text { return $_[0]->[1] if $_[0]->is_text or $_[0]->is_comment; }
101:
102: # test your token
103: sub is_text { return 1 if $_[0]->[0] eq 'T'; }
104: sub is_comment { return 1 if $_[0]->[0] eq 'C'; }
105: sub is_pi { return 1 if $_[0]->[0] eq 'PI'; }
106: sub is_process_instruction { goto &is_pi; }
107: sub is_start_tag { return $_[0]->_is( S => $_[1] ); }
108: sub is_end_tag { return $_[0]->_is( E => $_[1] ); }
109: sub is_tag { return $_[0]->_is( S => $_[1] )
110: || $_[0]->_is( E => $_[1] ); }
111:
112: sub _is {
113: if($_[0]->[0] eq $_[1]){
114: if(defined $_[2]){
115: return 1 if $_[0]->[1] eq $_[2];
116: }else{
117: return 1;
118: }
119: }
120: return 0;
121: }
122:
123: 1;
124:
125: =head1 DEMO
126:
127: execute this file as if it were a script, as in C<perl WhateverYouSavedItAs>,
128: and you'll see how/that this module works.
129:
130: =cut
131:
132: package main;
133:
134: unless(caller()){
135: use Data::Dumper;
136: my $file = 'REC-xml-19980210.xml';
137: $file = \ q[<p>
138: <scrap lang='ebnf' id='document'>
139: <head>Document</head>
140: <prod id='NT-document'><lhs>document</lhs>
141: <rhs><nt def='NT-prolog'>prolog</nt>
142: <nt def='NT-element'>element</nt>
143: <nt def='NT-Misc'>Misc</nt>*</rhs></prod>
144: </scrap>
145: </p>];
146: ## Cause chances are you won't have
147: ## http://www.w3.org/TR/1998/REC-xml-19980210.xml
148: ## as referenced in
149: ## http://www.xmltwig.com/article/ways_to_rome/ways_to_rome.html
150: ## in the current directory
151:
152:
153:
154: my $i = 0;
155: my $p = XML::TokeParser::Simple->new($file);
156:
157: my $Ret = "";
158:
159: while(defined(my $t = $p->get_token() )){
160:
161: if( $t->is_start_tag('lhs') ){
162: $i++;
163: $Ret = join '', "[$i] ", $p->get_text('/lhs'), " ::= ";
164: }elsif( $t->is_start_tag('rhs') ){
165: $Ret .= $p->get_text('/rhs');
166: }elsif( $t->is_end_tag('prod') ){
167: print clean($Ret),"\n";
168: $Ret = "";
169: }
170: }
171:
172: undef $Ret;
173: undef $p;
174:
175: ## mirod already did this, so I'm borrowing
176:
177: sub prod {
178: my( $twig, $prod)= @_;
179: my $lhs= $prod->field( 'lhs');
180: my $rhs= join '', map {$_->text} $prod->children( 'rhs');
181:
182: $i++;
183: my $prod_text = "[$i] $lhs ::= $rhs";
184: print clean( $prod_text) . "\n";
185: }
186:
187:
188: sub clean {
189: my( $string)= @_;
190: $string =~ s/\xc2\xa0/ /sg;
191: $string =~ s/\s+/ /g; $string=~ s{\s$}{}g;
192: return $string;
193: }
194: }
195:
196: 1;
197:
198: =head1 SEE ALSO
199:
200: L<XML::TokeParser>, L<HTML::TokeParser>, L<HTML::TokeParser::Simple>,
201: L<XML::Twig>
202:
203: =head1 AUTHOR
204:
205: D.H. <PodMaster@cpan.org>
206:
207: =head1 LICENSE
208:
209: copyright (c) D.H. 2002 All rights reserved.
210:
211: This program is released under the same terms as perl itself.
212: If you don't know what that means, visit http://perl.com
213: or execute "perl -v" at a commandline (assuming you have perl installed).
214:
215: =cut