I attacked a similar problem several years ago (pre-Perlmonks), and coded up my own solution
after looking at the few packages available at the time.
In addition to treating lines that started with asterisks as bullets,
I let bulleted (and numbered) lines span lines, turning
* A line like
this one.
into
The approach I took was to build a state machine with a history stack. The essential parts of the code are below. Any #TBD comments
reflect thoughts for newsthings to do/edge cases to handle. But the code got good
enough for what I needed, and the comments were left in the code to puzzle future generations.
The top-level driver looks like
sub format {
my $self = shift;
my $source = shift;
$previndent = 0;
$prevtype = 0;
@indentstack = (0);
@rulestack = (0); # prime the pump with the null ruleset
html();
foreach ( split("\n", $source) ) {
($indent, $type, $line, $info) = classify_line($_);
$indent = $previndent if $type == $BLANK;
if ( not maybe_new_scope() ) {
while ( maybe_exit_scope() ) {
}
maybe_new_scope();
}
maybe_emit_markup();
emit($self->markup($line));
$previndent = $indent;
$prevtype = $type;
}
exit_scopes();
my $html = html();
return cleanup_html($html);
}
Lines are "classified" by applying a set of regular expressions
sub classify_line { # return (indent, type, modified_line, info)
local $_ = shift;
s/^(\s*)$// && return (length($1), $BLANK, $_, undef);
s/^(\s*\*\s+)// && return (length($1), $BULLET, $_, undef);
s/^(\s*(\d+)\.\s+)// && return (length($1), $NUMBER, $_, $2);
s/^(\s*)// && return (length($1), $PLAIN, $_, undef);
die "classify_line:\n\t" . $_ . "\n";
}
The decisions to enter and exit "scopes" are handled by
sub maybe_new_scope {
# if the "enter new scope" rule for the current scope fires then
# find a scope whose "can enter" fires
# push the scope
# fire the "on enter" rule for the new scope
my $ruleset = $rulesets{$rulestack[-1]};
my $enter_rule = $ruleset->{maybe_push_scope};
if ( &$enter_rule() ) {
print "$rulestack[-1].enter_new_scope true\n" if $DEBUG;
foreach my $rulesetname ( keys %rulesets ) {
$ruleset = $rulesets{$rulesetname};
my $can_enter_rule = $ruleset->{can_enter};
if ( &$can_enter_rule() ) {
print "$rulesetname.can_enter true\n" if $DEBUG;
my $on_enter_rule = $ruleset->{on_enter};
&$on_enter_rule();
push @rulestack, $rulesetname;
push @indentstack, $indent;
}
}
}
return 0;
}
sub maybe_exit_scope {
# if the "exit scope" rule for the current scope fires then
# fire the "on exit" rule for the current scope
# (pop)
my $ruleset = $rulesets{$rulestack[-1]};
my $maybe_exit_rule = $ruleset->{maybe_exit_scope};
if ( &$maybe_exit_rule() ) {
print "$rulestack[-1].maybe_exit_scope true\n" if $DEBUG;
my $on_exit = $ruleset->{on_exit};
&$on_exit();
$prevtype = 0; # reprime the pump for maybe_emit_markup
pop @rulestack;
pop @indentstack;
return 1;
}
return 0;
}
The "ruleset" looks like
my %rulesets = (
# The "null" ruleset exists to force us into an initial scope
0 => {
can_enter =>
sub { },
on_enter =>
sub { },
maybe_emit_markup =>
sub { },
maybe_push_scope =>
sub { 1 },
maybe_exit_scope =>
sub { },
on_exit =>
sub { }
},
# The "Plain" (P) ruleset handles normal, unindented paragraphs
P => {
can_enter =>
sub { $indent == 0 },
on_enter =>
sub { },
maybe_emit_markup =>
sub {
if ( $type == $BLANK ) {
emit("<br>\n") if $prevtype == $BLANK;
}
elsif ( $type == $PLAIN ) {
emit("<p>") if $prevtype != $PLAIN;
}
},
maybe_push_scope =>
sub { $indent > 0 },
maybe_exit_scope =>
sub { 0 },
on_exit =>
sub { }
},
# The "BQ" (Blockquote) ruleset handles indented paragraphs
BQ => {
can_enter =>
sub {
$type == $PLAIN &&
$indent > $indentstack[-1] &&
($indent - $indentstack[-1]) < 8 # hack for <PRE>
},
on_enter =>
sub {
# emit("<p>") if $prevtype == $BLANK;
emit("<blockquote>\n");
$prevtype = 0; # prime pump for maybe_emit_markup
},
maybe_emit_markup =>
sub {
if ( $type == $BLANK ) {
# emit("<br>\n") if $prevtype == $BLANK;
}
elsif ( $type == $PLAIN ) {
emit("<p>") if $prevtype == $BLANK; #TBD && $prevt
+ype != 0;
}
},
maybe_push_scope =>
sub { $indent > $indentstack[-1] }, #TBD fix me for <PR
+E>
maybe_exit_scope =>
sub {
$type == $BULLET ||
$type == $NUMBER ||
$indent < $indentstack[-1]
},
on_exit =>
sub { emit("</blockquote>\n") }
},
# The "UL" (Unordered List) ruleset handles bullet lists
UL => {
can_enter =>
sub { $type == $BULLET &&
$indent > $indentstack[-1] &&
($indent - $indentstack[-1]) < 8 # hack for <PRE>
},
on_enter =>
sub { emit("<ul type=disc>\n") },
maybe_emit_markup =>
sub {
emit("<li>") if $type == $BULLET;
if ( $type == $BLANK ) {
emit("<br>") unless $prevtype == $BLANK;
emit("<br>");
}
},
maybe_push_scope =>
sub { $indent > $indentstack[-1] },
maybe_exit_scope =>
sub {
$type == $NUMBER ||
$indent < $indentstack[-1]
},
on_exit =>
sub { emit("</ul>\n") }
},
# The "OL" (Ordered List) ruleset handles numbered lists
OL => {
can_enter =>
sub { $type == $NUMBER &&
$indent > $indentstack[-1] &&
($indent - $indentstack[-1]) < 8 # hack for <PRE>
},
on_enter =>
sub { emit("<ol>\n") },
maybe_emit_markup =>
sub {
emit("<li>") if $type == $NUMBER;
if ( $type == $BLANK ) {
emit("<br>") unless $prevtype == $BLANK;
emit("<br>");
}
},
maybe_push_scope =>
sub { $indent > $indentstack[-1] },
maybe_exit_scope =>
sub {
$type == $BULLET ||
$indent < $indentstack[-1]
},
on_exit =>
sub { emit("</ol>\n") }
},
# The "PRE" (Preformatted) ruleset handles preformatted text
PRE => {
can_enter =>
sub { $indent > $indentstack[-1] && $indent >= 8 }, #TBD h
+ack
on_enter => sub { emit("<pre>") },
maybe_emit_markup =>
sub { emit(" " x ($indent - 8)) if $indent > 8 },
maybe_push_scope =>
sub { 0 },
maybe_exit_scope =>
sub { $indent < $indentstack[-1] },
on_exit =>
sub { emit("</pre>\n") },
}
);
This process generates less-than-ideal HTML, requiring cleanup.
# The HTML that we've generated may need to be cleaned up. By deferrin
+g
# cleanup, the algorithms above can be simpler.
#
sub cleanup_html {
my $html = shift;
$html =~ s|<br><br>\n</ul>|</ul>|sg;
$html =~ s|<br><br>\n</ol>|</ol>|sg;
$html =~ s|<p>(<h\d>)|$1|g;
$html =~ s|<p><hr|<hr|g;
# $html =~ s|</blockquote>\n</blockquote>|</blockquote></blockquot
+e>|sg;
$html =~ s|\n</pre>\n<p>|</pre>\n<p>|sg;
return $html;
}
That's the general idea, with a few minor details left out. (markup() handles markup within a line (including verifying a safe subset of HTML), emit() accumulates an HTML string, which html() returns, etc.)
|