Beefy Boxes and Bandwidth Generously Provided by pair Networks
P is for Practical
 
PerlMonks  

XML::TokeParser::Simple - pretty much like HTML::TokeParser::Simple

by PodMaster (Abbot)
on Nov 19, 2002 at 15:47 UTC ( [id://214158]=perlcraft: print w/replies, xml ) Need Help??

   1: =head1 NAME
   2: 
   3: XML::TokeParser::Simple - pretty much like HTML::TokeParser::Simple
   4: 
   5: =head1 DESCRIPTION
   6: 
   7: Get the benefits of L<XML::TokeParser::TOKEN|"What is XML::TokeParser::TOKEN">.
   8: 
   9: use XML::TokeParser::Simple and get the blessing ;)
  10: 
  11: Hopefully XML::TokeParser will add this in the next version.
  12: 
  13: 
  14: =head1 SYNOPSIS
  15: 
  16:     # file: printComments.pl
  17:     # desc: prints all the comments from an xml file
  18:     use XML::TokeParser::Simple;
  19:     my $p = new XML::TokeParser::Simple('file.xml');
  20:     while(defined( my $t = $p->get_token )) {
  21:         print $t->text,"\n" if $t->is_comment;
  22:     }
  23: 
  24: See L<"What is XML::TokeParser::TOKEN">
  25: 
  26: =cut
  27:  
  28: package XML::TokeParser::Simple;
  29: use XML::TokeParser();
  30: use vars qw/ @ISA $VERSION /;
  31: $VERSION = '0.01';
  32: @ISA = qw/ XML::TokeParser /;
  33: 
  34: sub get_token {
  35: 	my $self = shift;
  36: 	my $token = $self->SUPER::get_token( @_ );
  37: 	return unless defined $token;
  38: 	bless $token, 'XML::TokeParser::Token';
  39: }
  40: 
  41: sub get_tag {
  42: 	my $self = shift;
  43: 	my $token = $self->SUPER::get_tag( @_ );
  44: 	return unless defined $token;
  45: 	bless $token, 'XML::TokeParser::Token';
  46: }
  47: 
  48: package XML::TokeParser::Token;
  49: use strict;
  50: 
  51: =head1 What is XML::TokeParser::TOKEN
  52: 
  53: A token is a blessed array reference,
  54: that you acquire using $p->get_token,
  55: and that might look like:
  56: 
  57:     ["S",  $tag, $attr, $attrseq, $raw]
  58:     ["E",  $tag, $raw]
  59:     ["T",  $text, $raw]
  60:     ["C",  $text, $raw]
  61:     ["PI", $target, $data, $raw]
  62: 
  63: If you don't like remembering array indices,
  64: you may access the attributes of a token like:
  65: 
  66:     $p->get_token->tag,  $t->attr, $t->attrseg, $t->raw ...
  67: 
  68: =head2 Methods
  69: 
  70: Tokens  may be inspected using any of these is_* methods
  71: 
  72:     is_text
  73:     is_comment
  74:     is_pi
  75:     is_process_instruction
  76:     is_start_tag
  77:     is_end_tag
  78:     is_tag
  79: 
  80: like:
  81: 
  82:     print $t->target if $t->is_pi;
  83:     print "The comment says ", $t->text if $t->is_comment;
  84: 
  85: =cut
  86: 
  87: # for PI
  88: sub target  { return $_[0]->[1] if $_[0]->is_pi; }
  89: sub data    { return $_[0]->[2] if $_[0]->is_pi; }
  90: sub raw     { return $_[0]->[-1]; }
  91: 
  92: #for S
  93: sub attr    { return $_[0]->[2] if $_[0]->is_start_tag(); }
  94: sub attrseq { return $_[0]->[3] if $_[0]->is_start_tag(); }
  95: 
  96: #for S|E
  97: sub tag     { return $_[0]->[1] if $_[0]->is_tag; }
  98: 
  99: #for C|T
 100: sub text    { return $_[0]->[1] if $_[0]->is_text or $_[0]->is_comment; }
 101: 
 102: # test your token
 103: sub is_text                { return 1 if $_[0]->[0] eq 'T'; }
 104: sub is_comment             { return 1 if $_[0]->[0] eq 'C'; }
 105: sub is_pi                  { return 1 if $_[0]->[0] eq 'PI'; }
 106: sub is_process_instruction { goto &is_pi; }
 107: sub is_start_tag           { return $_[0]->_is( S => $_[1] ); }
 108: sub is_end_tag             { return $_[0]->_is( E => $_[1] ); }
 109: sub is_tag                 { return $_[0]->_is( S => $_[1] )
 110:                                  || $_[0]->_is( E => $_[1] ); }
 111: 
 112: sub _is {
 113:     if($_[0]->[0] eq $_[1]){
 114:         if(defined $_[2]){
 115:             return 1 if $_[0]->[1] eq $_[2];
 116:         }else{
 117:             return 1;
 118:         }
 119:     }
 120:     return 0;
 121: }
 122: 
 123: 1;
 124: 
 125: =head1 DEMO
 126: 
 127: execute this file as if it were a script, as in C<perl WhateverYouSavedItAs>,
 128: and you'll see how/that this module works.
 129: 
 130: =cut
 131: 
 132: package main;
 133: 
 134: unless(caller()){  
 135:     use Data::Dumper;
 136:     my $file = 'REC-xml-19980210.xml';
 137:        $file = \ q[<p>
 138: <scrap lang='ebnf' id='document'>
 139: <head>Document</head>
 140: <prod id='NT-document'><lhs>document</lhs>
 141: <rhs><nt def='NT-prolog'>prolog</nt> 
 142: <nt def='NT-element'>element</nt> 
 143: <nt def='NT-Misc'>Misc</nt>*</rhs></prod>
 144: </scrap>
 145: </p>];
 146: ## Cause chances are you won't have
 147: ## http://www.w3.org/TR/1998/REC-xml-19980210.xml
 148: ## as referenced in 
 149: ## http://www.xmltwig.com/article/ways_to_rome/ways_to_rome.html
 150: ## in the current directory
 151: 
 152: 
 153: 
 154:     my $i = 0;
 155:     my $p = XML::TokeParser::Simple->new($file);
 156: 
 157:     my $Ret = "";
 158: 
 159:     while(defined(my $t = $p->get_token() )){
 160:     
 161:         if( $t->is_start_tag('lhs') ){
 162:             $i++;
 163:             $Ret = join '', "[$i] ", $p->get_text('/lhs'), " ::= ";
 164:         }elsif( $t->is_start_tag('rhs') ){
 165:             $Ret .= $p->get_text('/rhs');
 166:         }elsif( $t->is_end_tag('prod') ){
 167:             print clean($Ret),"\n";
 168:             $Ret = "";
 169:         }
 170:     }
 171:     
 172:     undef $Ret;
 173:     undef $p;
 174:     
 175:     ## mirod already did this, so I'm borrowing
 176:     
 177:     sub prod {
 178:         my( $twig, $prod)= @_;
 179:         my $lhs= $prod->field( 'lhs');
 180:         my $rhs= join '', map {$_->text} $prod->children( 'rhs');
 181:     
 182:         $i++;
 183:         my $prod_text = "[$i] $lhs ::= $rhs";
 184:         print clean( $prod_text) . "\n";
 185:     }
 186:     
 187:     
 188:     sub clean { 
 189:             my( $string)= @_;
 190:             $string =~ s/\xc2\xa0/ /sg;
 191:             $string =~ s/\s+/ /g; $string=~ s{\s$}{}g;
 192:             return $string;
 193:     }
 194: }
 195: 
 196: 1;
 197: 
 198: =head1 SEE ALSO
 199: 
 200: L<XML::TokeParser>, L<HTML::TokeParser>, L<HTML::TokeParser::Simple>,
 201: L<XML::Twig>
 202: 
 203: =head1 AUTHOR
 204: 
 205: D.H. <PodMaster@cpan.org>
 206: 
 207: =head1 LICENSE
 208: 
 209: copyright (c) D.H. 2002 All rights reserved.
 210: 
 211: This program is released under the same terms as perl itself.
 212: If you don't know what that means, visit http://perl.com
 213: or execute "perl -v" at a commandline (assuming you have perl installed).
 214: 
 215: =cut

Replies are listed 'Best First'.
Re: XML::TokeParser::Simple - pretty much like HTML::TokeParser::Simple
by Ovid (Cardinal) on Nov 19, 2002 at 18:58 UTC

    I'm very envious. It looks like XML::TokeParser is a much easier module to work with. The tokens returned by HTML::TokeParser are slightly different in structure depending upon whether or not someone has generated a token via get_tag or get_token, thus forcing me to constantly synchronize the tokens internally and restore them to their original state when I'm done. I should be shot for having to write this:

    sub _synch_arrays { my $array_ref = shift; my $tag_func = GET_TOKEN; if ( ! grep { $array_ref->[0] eq $_ } keys %token ){ $tag_func = GET_TAG; if ( '/' ne substr $array_ref->[0], 0, 1 ) { unshift @$array_ref, 'S'; } else { unshift @$array_ref, 'E'; } } return ( $array_ref, $tag_func ); }

    I also think your code might be a bit cleaner than mine, too. Hmmm... maybe time for another update :( I also happened to break my $functions xor $methods rule. Whoops. I'm a hypocrite :)

    Cheers,
    Ovid

    New address of my CGI Course.
    Silence is Evil (feel free to copy and distribute widely - note copyright text)

      After all this time, and finally getting closer and closer to releasing XML::TokeParser (one which has this functionality built-in), I finally took another look at this thread and realized I too need to do something like that.

      D'oh.

      I mean, why would you get_tag and then test to see if it's a tag, or a process instruction, since it can only be a tag.

      I quickly fixed this and then I got reminded again that a XML::TokeParser::Token doesn't have a constructor -- yuck.

      Then I thought maybe I should force get_tag to return a proper token, but that would break backwards compatiblity, and I sure don't wanna do that.

      Then I think to myself I should forget all this nonsense, and have

      • XML::TokeParser::Token::StartTag
      • XML::TokeParser::Token::EndTag
      • XML::TokeParser::Token::PI
      • XML::TokeParser::Token::Comment
      • XML::TokeParser::Token::Text
      Might as well take full advantage of blessed references. Something like
      package XML::TokeParser::Token; sub is_text { return 0; } sub is_comment { return 0; } sub is_pi { return 0; } sub is_tag { return 0; } sub is_start_tag { return 0; } sub is_end_tag { return 0; } sub raw { return $_[0]->[-1]; } package XML::TokeParser::Token::Text; # use vars::i '@ISA' => 'XML::TokeParser::Token'; # i'll probably put +vars::i on cpan also use vars '@ISA'; @ISA = 'XML::TokeParser::Token'; sub is_text { return 1; } sub text { return $_[0]->[-2]; }
      Thoughts/Comments? I think maybe that's what i'll do, because
      sub is_end_tag { if( $_[0]->[0] eq 'E' or ( @{$_[0]} == 2 && substr( $_[0]->[0], 0, 1 ) eq '/' ) ){ if(defined $_[1]){ return 1 if $_[0]->[1] eq $_[1]; } else { return 1; } } return 0; }
      does not look so hot. *sigh*


      MJD says you can't just make shit up and expect the computer to know what you mean, retardo!
      I run a Win32 PPM repository for perl 5.6x+5.8x. I take requests.
      ** The Third rule of perl club is a statement of fact: pod is sexy.

        I've been thinking about that for a while. I was considering a few other options that I might want to toss in the code and somehow never quite get around to it. What you propose is a heck of a lot cleaner and will clear up some other issues. I guess I was the bad lazy. I hope you don't mind if I steal your code :)

        Incidentally, if you haven't seen it, HTML::TokeParser::Simple is now at version 2.1 and has three HTML munging methods added that cover some very common situations that people keep wanting to deal with.

        Cheers,
        Ovid

        New address of my CGI Course.
        Silence is Evil (feel free to copy and distribute widely - note copyright text)

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others about the Monastery: (4)
As of 2024-03-29 09:31 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found