Regex Titans? - format plaintext to HTML

HTML::TextToHTML

See also HTML::FromText (alt.). It isn't as recent but appears to do much the same thing.

--k.

In addition to treating lines that started with asterisks as bullets, I let bulleted (and numbered) lines span lines, turning

  * A line like
    this one.
[download]

A line like this one.

The approach I took was to build a state machine with a history stack. The essential parts of the code are below. Any #TBD comments reflect thoughts for newsthings to do/edge cases to handle. But the code got good enough for what I needed, and the comments were left in the code to puzzle future generations.

The top-level driver looks like

  sub format {
      my $self = shift;
      my $source = shift;

      $previndent = 0;
      $prevtype = 0;
      @indentstack = (0);
      @rulestack = (0);    # prime the pump with the null ruleset

      html();

      foreach ( split("\n", $source) ) {
          ($indent, $type, $line, $info) = classify_line($_);
          $indent = $previndent if $type == $BLANK;
    
          if ( not maybe_new_scope() ) {
              while ( maybe_exit_scope() ) {
              }
              maybe_new_scope();
          }

          maybe_emit_markup();
          emit($self->markup($line));
          $previndent = $indent;
          $prevtype = $type;
      }

      exit_scopes();

      my $html = html();

      return cleanup_html($html);
}
[download]

  sub classify_line { # return (indent, type, modified_line, info)
      local $_ = shift;

      s/^(\s*)$//          && return (length($1), $BLANK, $_, undef);
      s/^(\s*\*\s+)//      && return (length($1), $BULLET, $_, undef);
      s/^(\s*(\d+)\.\s+)// && return (length($1), $NUMBER, $_, $2);
      s/^(\s*)//           && return (length($1), $PLAIN, $_, undef);
      die "classify_line:\n\t" . $_ . "\n";
  }
[download]

sub maybe_new_scope {

#    if the "enter new scope" rule for the current scope fires then
#        find a scope whose "can enter" fires
#        push the scope
#        fire the "on enter" rule for the new scope

    my $ruleset = $rulesets{$rulestack[-1]};
    my $enter_rule = $ruleset->{maybe_push_scope};

    if ( &$enter_rule() ) {
        print "$rulestack[-1].enter_new_scope true\n" if $DEBUG;

        foreach my $rulesetname ( keys %rulesets ) {
            $ruleset = $rulesets{$rulesetname};
            my $can_enter_rule = $ruleset->{can_enter};
            if ( &$can_enter_rule() ) {
                print "$rulesetname.can_enter true\n" if $DEBUG;
        
                my $on_enter_rule = $ruleset->{on_enter};
                &$on_enter_rule();
        
                push @rulestack, $rulesetname;
                push @indentstack, $indent;
            }        
        }
    }

    return 0;
}

sub maybe_exit_scope {

#    if the "exit scope" rule for the current scope fires then
#        fire the "on exit" rule for the current scope
#        (pop)

    my $ruleset = $rulesets{$rulestack[-1]};
    my $maybe_exit_rule = $ruleset->{maybe_exit_scope};

    if ( &$maybe_exit_rule() ) {
        print "$rulestack[-1].maybe_exit_scope true\n" if $DEBUG;
    
        my $on_exit = $ruleset->{on_exit};
            &$on_exit();
    
        $prevtype = 0;    # reprime the pump for maybe_emit_markup
    
        pop @rulestack;
        pop @indentstack;
    
        return 1;
    }

    return 0;
}
[download]

my %rulesets = (
    # The "null" ruleset exists to force us into an initial scope
    0 => {
        can_enter =>
            sub { },
        on_enter =>
            sub { },
        maybe_emit_markup =>
            sub { },
        maybe_push_scope =>
            sub { 1 },
        maybe_exit_scope =>
            sub { },
        on_exit =>
            sub { }
     },

    # The "Plain" (P) ruleset handles normal, unindented paragraphs
    P => {
        can_enter =>
            sub { $indent == 0 },
        on_enter =>
            sub { },
        maybe_emit_markup =>
            sub {
                if ( $type == $BLANK ) {
                    emit("<br>\n") if $prevtype == $BLANK;
                }
                elsif ( $type == $PLAIN ) {
                    emit("<p>") if $prevtype != $PLAIN;
                }
            },
        maybe_push_scope =>
            sub { $indent > 0 },
        maybe_exit_scope =>
            sub { 0 },
        on_exit =>
            sub { }
    },

    # The "BQ" (Blockquote) ruleset handles indented paragraphs
    BQ => {
        can_enter =>
            sub {
                $type == $PLAIN &&
                $indent > $indentstack[-1] &&
                ($indent - $indentstack[-1]) < 8    # hack for <PRE>
            },
        on_enter =>
            sub {
                # emit("<p>") if $prevtype == $BLANK;
                emit("<blockquote>\n");
                $prevtype = 0;  # prime pump for maybe_emit_markup
            },
        maybe_emit_markup =>
            sub {
                if ( $type == $BLANK ) {
                    # emit("<br>\n") if $prevtype == $BLANK;
                }
                elsif ( $type == $PLAIN ) {
                    emit("<p>") if $prevtype == $BLANK; #TBD && $prevt
+ype != 0;
                }
            },
        maybe_push_scope =>
            sub { $indent > $indentstack[-1] },    #TBD fix me for <PR
+E>
        maybe_exit_scope =>
            sub {
                $type == $BULLET || 
                $type == $NUMBER ||
                $indent < $indentstack[-1]
            },
        on_exit =>
            sub { emit("</blockquote>\n") }
    },

    # The "UL" (Unordered List) ruleset handles bullet lists
    UL => {
        can_enter =>
            sub { $type == $BULLET &&
                  $indent > $indentstack[-1] &&
                  ($indent - $indentstack[-1]) < 8    # hack for <PRE>
            },
        on_enter =>
            sub { emit("<ul type=disc>\n") },
        maybe_emit_markup => 
            sub {
                 emit("<li>") if $type == $BULLET;
                 if ( $type == $BLANK ) {
                     emit("<br>") unless $prevtype == $BLANK;
                 emit("<br>");
                 }
             },
        maybe_push_scope =>
            sub { $indent > $indentstack[-1] },
        maybe_exit_scope =>
            sub {
                $type == $NUMBER ||
                $indent < $indentstack[-1]
            },
        on_exit =>
            sub { emit("</ul>\n") }
      },

    # The "OL" (Ordered List) ruleset handles numbered lists
    OL => {
        can_enter =>
            sub { $type == $NUMBER &&
                  $indent > $indentstack[-1] &&
                  ($indent - $indentstack[-1]) < 8    # hack for <PRE>
            },
        on_enter =>
            sub { emit("<ol>\n") },
        maybe_emit_markup => 
            sub {
                emit("<li>") if $type == $NUMBER;
                if ( $type == $BLANK ) {
                    emit("<br>") unless $prevtype == $BLANK;
                    emit("<br>");
                }
            },
        maybe_push_scope =>
            sub { $indent > $indentstack[-1] },
        maybe_exit_scope =>
            sub {
                $type == $BULLET ||
                $indent < $indentstack[-1]
            },
        on_exit =>
            sub { emit("</ol>\n") }
    },

    # The "PRE" (Preformatted) ruleset handles preformatted text
    PRE => {
        can_enter =>
            sub { $indent > $indentstack[-1] && $indent >= 8 }, #TBD h
+ack
           on_enter => sub { emit("<pre>") },
        maybe_emit_markup =>
            sub { emit(" " x ($indent - 8)) if $indent > 8 },
        maybe_push_scope =>
            sub { 0 },
        maybe_exit_scope =>
            sub { $indent < $indentstack[-1] },
        on_exit =>
            sub { emit("</pre>\n") },
    }
);
[download]

# The HTML that we've generated may need to be cleaned up. By deferrin
+g
# cleanup, the algorithms above can be simpler.
#
sub cleanup_html {
    my $html = shift;

    $html =~ s|<br><br>\n</ul>|</ul>|sg;
    $html =~ s|<br><br>\n</ol>|</ol>|sg;
    $html =~ s|<p>(<h\d>)|$1|g;
    $html =~ s|<p><hr|<hr|g;
    # $html =~ s|</blockquote>\n</blockquote>|</blockquote></blockquot
+e>|sg;
    $html =~ s|\n</pre>\n<p>|</pre>\n<p>|sg;

    return $html;
}
[download]

markup()

emit()

html()

[reply]
[d/l]
[select]

[reply]

s{^\s*[-?](.*)$}{<ul><li>$1</li></ul>}mg; # note the m rather than s m
+odifier
s/([^>])$/$1<br>/mg; # add <br> only where no closing tag at EOL
s{</ul>\n<ul>}{\n}sg; # cleanup
[download]

____________

Makeshifts last the longest.

[reply]
[d/l]


Don't ask to ask, just ask
	PerlMonks