=head1 B my $scrubbed_text = scrub_input($raw_text, [$handlers]); This takes a string and an optional ref to a hash of handlers, and returns html-escaped output, except for the sections handled by the handlers which do whatever they want. If handlers are passed their names should be lower case and start with a character matching [^\w\s\d] or else they will not be matched properly. While parsing the string, when they can be matched case insensitively, then the handler is called. It will be passed a reference to $raw_text right after that matches the name of the handler. (pos($raw_text) will point to the end of the name.) It should return the text to be inserted into the output, and set pos($raw_text) to where to continue parsing from, or 0 if no text was handled. Two special handlers that may be used are "pre" and "post" that will be called before and after (respectively) the raw text is processed. For consistency they also get a reference to the raw text. If no handler is passed, it will use \%handlers instead. =cut sub scrub_input { my $raw = shift; local @open_tags; my $handler = shift || \%handlers; $handler->{pre} ||= sub {return '';}; $handler->{post} ||= sub { return join '', map "", reverse @open_tags; }; my $scrubbed = $handler->{pre}->(\$raw); # This would be faster with the trie code from node 30896. But # that is not the point of this example so I have not done that. # Also note the next line is meant to force an NFA engine to # match the longest alternative first my $re_str = join "|", map {quotemeta} reverse sort keys %$handler; my $is_handled = qr/$re_str/i; while ($raw =~ /\G([\w\d\s]*)/gi) { $scrubbed .= $1; my $pos = pos($raw); if ($raw =~ /\G($is_handled)/g) { $scrubbed .= $handler->{ lc($1) }->(\$raw); } unless (pos($raw)) { if (length($raw) == $pos) { # EXIT HERE # return $scrubbed . $handler->{post}->(\$raw); } else { my $char = substr($raw, $pos, 1); pos($raw) = $pos + 1; $scrubbed .= &encode_entities($char); } } } confess("I have no idea how I got here!"); }