OK, this took me a lot longer than I expected it to, and the algorithm ended up a little convoluted, but I think this better handles a few edge cases...
use v5.14;
use strictures;
package Parser {
    use Moo 1.006000;
    use Types::Standard qw( RegexpRef ArrayRef );
    use Text::Balanced qw( extract_bracketed );
    use HTML::Entities qw( encode_entities );
    use namespace::autoclean;

    # Type for the allowed_tags attribute: a compiled regexp, or an array
    # reference of literal tag names that is coerced into an alternation.
    my $Allowance = RegexpRef->plus_coercions(
        ArrayRef, sub {
            my @names = @$_;

            # An empty list means "no tags allowed".  An empty alternation
            # would instead match the empty string and turn every bare '<'
            # into a nameless tag, so use a pattern that can never match.
            return qr/(?!)/ unless @names;

            my $alternation = join "|", map { quotemeta } @names;
            return qr/$alternation/;
        },
    );

    # Pattern matching the (case-sensitive) tag names that may be written
    # as NAME<...> in the input.  Accepts a regexp or an array of names.
    has allowed_tags => (
        is      => 'ro',
        isa     => $Allowance,
        coerce  => 1,
        builder => sub {
            [qw(A ABBR ACRONYM B BIG CITE CODE DFN EM I KBD Q SAMP
                SMALL SPAN STRONG SUB SUP TT VAR)]
        },
    );

    # Parse each argument and write the result to the currently selected
    # output handle.  May be called on an instance or directly on the
    # class, in which case a parser with default settings is constructed.
    # Everything parse() returns is printed, i.e. the markup followed by
    # any attribute strings that were left over at the top level.
    sub print {
        my $self = shift;
        $self = $self->new unless ref $self;

        # Inside this package "print" below is still the built-in; the
        # method defined here is only reachable through ->print.
        print $self->parse($_) for @_;
    }

    # Convert NAME<content|attr|attr...> markup in $text to HTML.
    #
    # Returns a list: the generated markup (always a defined string),
    # followed by any "|"-separated attribute strings found at the end of
    # $text.  The attributes are returned rather than rendered because they
    # belong to whichever tag encloses $text; the enclosing call places
    # them in that tag's opening element.  Call in list context.
    #
    # Plain text is entity-encoded, <!-- ... --> comments are removed, and
    # tag names are lower-cased in the output.  Markup that cannot be
    # balanced is emitted as literal, entity-encoded text.
    sub parse {
        my $self   = shift;
        my ($text) = @_;
        my $tags   = $self->allowed_tags;

        my ($before, $match) = $text =~ m{
            \A          # start of string
            (.*?)       # leading plain text ($before)
            (           # first piece of markup ($match): either...
                \<\!--      # ...the start of a comment
              |             # or...
                $tags\<     # ...an allowed tag name followed by '<'
            )
        }xms;

        # No markup at all: what is left is "content|attr|attr...".
        unless (defined $match) {
            my ($content, @attrs) = split /\|/, $text;

            # split yields nothing for an empty string; fall back to ''
            # so callers never receive undef as the markup.
            return (encode_entities($content // ''), @attrs);
        }

        # Remove the leading text from $text (using its raw length), then
        # encode it just like text that contains no markup.
        substr($text, 0, length $before) = '';
        $before = encode_entities($before);

        # The first thing that needs handling is a comment.
        if ($match eq '<!--') {
            # Strip every complete comment; /s lets them span lines.
            $text =~ s/<!--.*?-->//gs;

            # If $text still begins with '<!--' that comment is never
            # closed.  Emit the opener as literal text so the recursion
            # below always works on a shorter string.
            my $literal = '';
            if (index($text, '<!--') == 0) {
                $literal = encode_entities(substr($text, 0, 4, ''));
            }

            # Keep attributes separate so the caller can still see them.
            my ($markup, @attrs) = $self->parse($text);
            return ($before . $literal . $markup, @attrs);
        }

        # Otherwise it is a tag.  Drop the trailing '<' to get its name,
        # then remove the name so $text starts at the opening bracket.
        my $found_tag = lc substr($match, 0, -1);
        substr($text, 0, length $found_tag) = '';

        # '<' ... '>' are the brackets; brackets inside quotes are ignored.
        my ($got, $remainder) = extract_bracketed($text, q/<"'>/);

        # Unbalanced brackets or quotes: extraction failed.  Emit the tag
        # name and its '<' as literal text and carry on with the rest.
        unless (defined $got) {
            substr($text, 0, 1) = '';
            my ($markup, @attrs) = $self->parse($text);
            return ($before . encode_entities($match) . $markup, @attrs);
        }

        # Strip the enclosing '<' and '>' before parsing the tag's content.
        my $inner = substr($got, 1, -1);

        my ($markup,      @attrs)      = $self->parse($inner);
        my ($more_markup, @more_attrs) = $self->parse($remainder);

        my $open_tag = @attrs ? "<$found_tag @attrs>" : "<$found_tag>";

        return (
            join("",
                $before,
                $open_tag,
                $markup,
                "</$found_tag>",
                $more_markup,
            ),
            @more_attrs,
        );
    }
}
# Demonstration: render a sample paragraph that exercises nested tags,
# tag attributes, a character needing entity encoding, and a comment.
Parser->print(<<'TEXT');
Anyone who watches the Syfy channel knows that on
Monday nights they aired three television series
I<A<EurSUP<e>ka|href="Movies_by_series.pl?series=EWA#EUReKA">|class="title">,
I<A<Warehouse & 13|href="Movies_by_series.pl?series=EWA#Warehouse_13">>,
and I<A<Alphas|href="Movies_by_series.pl?series=EWA#Alphas">>.
Some might not be aware that these three series have formed a crossover
cosmology which I call A<EWA|href="Movies_by_series.pl?series=EWA">
<!-- This is a long string. -->
TEXT