http://qs321.pair.com?node_id=164513
Description: Speedy XML-like parser (not 100% compliant). It takes a funky approach and matches against substrs instead of gathering in the match, this made it over 20x faster in my test case.

This is going into a module that needs to be very light weight, which more or less precludes dependencies. XML::Parser::Lite isn't available by itself, and it along with XML::SAX::PurePerl have pretty hefty interfaces.

Fixed issues

  • <foo/>
  • <foo bar="1" baz="=" quux="don't"/>
  • <foo><bar/>quux</foo>
  • <!-- <foo/> -->
  • Last Modified: Sat May 11 04:15:44 UTC 2002

    UPDATE: Please see XML::RSSLite for further updates.

    =pod
    
    =head1 SYNOPSIS
    
    parse(\%parsedTree, \$parseThis, 'topTag', $comments);
    
    =head1 DESCRIPTION
    
    =over
    
    =item parsedTree - required
    
    reference to hash to store the parsed document within
    
    =item parseThis  - required
    
    reference to scalar containg the document to parse
    
    =item topTag     - optional
    
    tag to consider the root node, leaving this undefined is not recommend
    +ed.
    
    =item comments   - optional
    
    =over
    
    =item false will remove contents from
    
    =item true will not remove comments
    
    =item array reference is true, comments are stored here
    
    =back
    
    =back
    
    =head1 CAVEATS
    
    Not a conforming parser, it does not handle the following
    
    =over
    
    =item <foo bar=">">
    
    =item <foo><bar> <bar></bar> <bar></bar> </bar></foo>
    
    =item <![CDATA[ ]]>
    
    =item PI
    
    =back
    
    It's non-validating, without a DTD the following cannot be addressed
    
    =over
    
    =item entities
    
    =item namespace
    
    =back
    
    =cut
    
    sub parseXML{
      my($hash, $xml, $tag, $comments) = @_;
      my($begin, $end, @comments);
      local $_;
    
      #Kill comments
      while( ($begin =  index(${$xml}, '<!--')) > -1 &&
        ${$xml} =~ m%<!--.*?--(>)%sg ){
        my $str = substr(${$xml}, $begin, pos(${$xml})-$begin, '');
        
        #Save them if requested
        do{ unshift @comments, [$begin, substr($str, 4, length($str)-7)] }
          if $comments;
      }
    
    
      #Find topTag and set pos to start matching from there
      ${$xml} =~ /<$tag(?:>|\s)/g;
      ($begin, $end) = (0, pos(${$xml})||0);
    
      #Match either <foo></foo> or <bar />, optional attributes, stash tag
    + name
      while( ${$xml} =~ m%<([^\s>]+)(?:\s+[^>]*?)?(?:/|>.*?</\1)>%sg ){   
    +  
    
        #Save the tag name, we'll need it
        $tag = $1 || $2;
    
        #Save the new beginning and end
        ($begin, $end) = ($end, pos(${$xml}));
    
        #Get the bit we just matched.
        my $str = substr(${$xml}, $begin, $end-$begin);
        
        #Extract the actual attributes and contents of the tag
    #   $str =~ m%<$tag\s*([^>]*?)?>(.*?)</$tag>%s ||
        $str =~ s%^.*?<$tag\s*([^>]*?)?>(.*?)</$tag>%<$tag>$2</$tag>%s ||
          $str =~ m%<$tag\s*([^>]*?)?\s*/>%;
        my($attr, $content) = ($1, $2);
    
        #Did we get attributes? clean them up and chuck them in a hash.
        if( $attr ){
          ($_, $attr) = ($attr, {});
          $attr->{$1} = $3 while m/([^\s=]+)\s*=\s*(['"])(.*?)\2/g;
        }
    
        #Recurse if contents has more tags, replace contents with referenc
    +e we get
        if( $content && index($content, '<') > -1 ){
          parseXML($content={}, \$str, $tag, 0);
          #Was there any data in the contents? We should extract that...
    #      if( $str =~ />[^><\s]+</ || $str =~ />(?:[^><\s]+\s+)+</ ){
          if( $str =~ />[^><]+</ ){
        #The odd RE above \S+\s+ shortcircuits unnecessary entry
    
        my $length = length($str);
        my $taglen = length($tag)+2;
        $str= substr($str, $taglen, $length-1-2*$taglen);
    
        #Clean whitespace between tags
        #$str =~ s%(?<=>)?\s*(?=<)%%g; #XXX ~same speed, wacko warning
        #$str =~ s%(>?)\s*<%$1<%g;
    
    #    $str =~ s%<$_\s*(?:[^>]*?)?(?:/|>.*?</$_)>%%sg for keys %{$conten
    +t};
        my $qr = qr{@{[join('|', keys %{$content})]}};
        $str =~ s%<($qr)\s*(?:[^>]*?)?(?:/|>.*?</\1)>%%sg;
    
        $content->{'<>'} = $str;#XXX if $str;
          }
        }
    
        my($inhash);
        if( ref($content) ){
          #We have attributes? Then we should save them.
          $inhash = $attr || {};
    
          #Contents too? Save them as well.
          if( $content ){
        for( keys %{$content} ){
          $inhash->{$_} = exists($inhash->{$_})   ?
            (ref($inhash->{$_})  eq 'ARRAY'       ?
             [@{$inhash->{$_}}, $content->{$_}]   :
             [  $inhash->{$_},  $content->{$_}] ) : $content->{$_};
        }
        
          }
        }
        else{
          #Otherwise save our content
          $inhash = $content;
        }
        
        $hash->{$tag} = exists($hash->{$tag}) ?
          (ref($hash->{$tag})  eq 'ARRAY'     ?
        [@{$hash->{$tag}}, $inhash]       :
        [  $hash->{$tag},  $inhash]  )    : $inhash;
      }
    
      if( $comments ){
        #Restore comments if requested
        substr(${$xml}, $_->[0], 0, '<!--'.$_->[1].'-->') for @comments;
    
        #Expose comments if requested
        do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq
    + 'ARRAY';
      }
    }