Re: wiki regex reprocessing replacement

Here's my take. One thing I don't understand is the inclusion of > < characters in the pre- and post-markup tag delimiters (update: e.g., my $pre = qr/(^|\s|>)/; here), probably because I'm not familiar with wikisyntax. Can you link me to a discussion of the role of these characters? I prepared two versions, one using (?(DEFINE) ...) and one based purely on qr// interpolation. Maybe one is faster, but I haven't done any Benchmark-ing (nor am I likely to).

File W2H_1.pm:

# W2H_1.pm                                                  15feb20waw

package W2H_1;

use 5.010;  # need regex extensions

use strict;
use warnings;

# use Data::Dump qw(pp dd);  # for debug

use constant DEBUG => 0;
use constant {
    DBPR_rx => 0 && DEBUG,
    };

my %w2h_map = qw(* b   / i   _ u);  # wiki markup -> HTML tag mapping

my $mq_wiki_mark = quotemeta join '', keys %w2h_map;  # meta-quoted wi
+ki

my $rx_wml = qr{
  ((?&PRE_OPEN) (?&WIKI_MARK))                            # $1: openin
+g wikimark
  ((?: (?: (?: (?! \g-2 (?&POST_CLOSE)) .)*+) | (?R) )*)  # $2: marked
+ body
  \g-2 (?&POST_CLOSE)                                     # closing wi
+kimark

  (?(DEFINE)
    (?<PRE_OPEN>   (?<! [^>\s$mq_wiki_mark]))  # after string start, >
+ or \s or wml
    (?<POST_CLOSE> (?!  [^<\s$mq_wiki_mark]))  # before string end,  <
+ or \s or wml
    (?<WIKI_MARK>  [\Q$mq_wiki_mark\E])
    )

  }xms;
DBPR_rx and print $rx_wml, "\n";

# only used for development diagnostics
my $rx_pre_open   = qr{ (?<! [^\s>]) }xms;
my $rx_post_close = qr{ (?!  [^\s<]) }xms;
my $rx_wiki_mark  = qr{ [$mq_wiki_mark] }xms;

# only used for development diagnostics
die q{bad '\A wm' match} unless '_'  =~ m{ $rx_pre_open   $rx_wiki_mar
+k  }xms;
die q{bad '\s wm' match} unless ' _' =~ m{ $rx_pre_open   $rx_wiki_mar
+k  }xms;
die q{bad '> wm'  match} unless '>_' =~ m{ $rx_pre_open   $rx_wiki_mar
+k  }xms;

die q{bad 'wm \z' match} unless '_'  =~ m{ $rx_wiki_mark  $rx_post_clo
+se }xms;
die q{bad 'wm \s' match} unless '_ ' =~ m{ $rx_wiki_mark  $rx_post_clo
+se }xms;
die q{bad 'wm <'  match} unless '_<' =~ m{ $rx_wiki_mark  $rx_post_clo
+se }xms;

# subroutines ######################################################

sub w2h {  # not exported; invoke fully-qualified

    my ($wiki,  # string possibly containing wiki markup
        ) = @_;

    (my $html = $wiki) =~ s{
        $rx_wml  # captures to $1 $2
        }
        { my ($html_tag, $body) = ($w2h_map{$1}, w2h($2));
          "<$html_tag>$body</$html_tag>";
        }xmsge;

    return $html;

    }

1;
[download]

File W2H_2.pm:

# W2H_2.pm                                                  15feb20waw

package W2H_2;

use 5.010;  # need regex extensions

use strict;
use warnings;

# use Data::Dump qw(pp dd);  # for debug

use constant DEBUG => 0;
use constant {
    DBPR_rx => 0 && DEBUG,
    };

my %w2h_map = qw(* b   / i   _ u);  # wiki markup -> HTML tag mapping

my $mq_wiki_mark = quotemeta join '', keys %w2h_map;  # meta-quoted wi
+ki

my $rx_pre_open   = qr{ (?<! [^>\s$mq_wiki_mark]) }xms;
my $rx_post_close = qr{ (?!  [^<\s$mq_wiki_mark]) }xms;
my $rx_wiki_mark  = qr{ [$mq_wiki_mark] }xms;

my $rx_wml = qr{
  ($rx_pre_open $rx_wiki_mark)                            # $1: openin
+g wikimark
  ((?: (?: (?: (?! \g-2 $rx_post_close) .)*+) | (?R) )*)  # $2: marked
+ body
  \g-2 $rx_post_close                                     # closing wi
+kimark
  }xms;
DBPR_rx and print $rx_wml, "\n";

# only used for development diagnostics
die q{bad '\A wm' match} unless '_'  =~ m{ $rx_pre_open   $rx_wiki_mar
+k  }xms;
die q{bad '\s wm' match} unless ' _' =~ m{ $rx_pre_open   $rx_wiki_mar
+k  }xms;
die q{bad '> wm'  match} unless '>_' =~ m{ $rx_pre_open   $rx_wiki_mar
+k  }xms;

die q{bad 'wm \z' match} unless '_'  =~ m{ $rx_wiki_mark  $rx_post_clo
+se }xms;
die q{bad 'wm \s' match} unless '_ ' =~ m{ $rx_wiki_mark  $rx_post_clo
+se }xms;
die q{bad 'wm <'  match} unless '_<' =~ m{ $rx_wiki_mark  $rx_post_clo
+se }xms;

# subroutines ######################################################

sub w2h {  # not exported; invoke fully-qualified

    my ($wiki,  # string possibly containing wiki markup
        ) = @_;

    (my $html = $wiki) =~ s{
        $rx_wml  # captures to $1 $2
        }
        { my ($html_tag, $body) = ($w2h_map{$1}, w2h($2));
          "<$html_tag>$body</$html_tag>";
        }xmsge;

    return $html;

    }

1;
[download]

File W2H.t:

# W2H.t                                                     15feb20waw

use strict;
use warnings;

# use Data::Dump qw(pp dd);  # for debug

use Test::More 'no_plan';
use Test::NoWarnings;

BEGIN {
    use_ok 'W2H_1';
    use_ok 'W2H_2';
    }

use constant DEBUG => 1;
use constant {
    DBPR_m_d1  => 0 && DEBUG,
    };

my @Tests = (

  [   '_abcd_'   =>       '<u>abcd</u>',         ],
  [  '_/abcd/_'  =>    '<u><i>abcd</i></u>',     ],
  [ '*_/abcd/_*' => '<b><u><i>abcd</i></u></b>', ],

  [  '*_/a*b_c/d/_*'  =>  '<b><u><i>a*b_c/d</i></u></b>',  ],
  [ ' *_/a*b_c/d/_* ' => ' <b><u><i>a*b_c/d</i></u></b> ', ],

  [ '_abc__def_' => '<u>abc</u><u>def</u>', ],
  [ '_abc__def_*ghi**jkl*/mno//pqr/'
    =>
    '<u>abc</u><u>def</u><b>ghi</b><b>jkl</b><i>mno</i><i>pqr</i>',
    ],
  [ ' _abc__def_*ghi**jkl*/mno//pqr/ '
    =>
    ' <u>abc</u><u>def</u><b>ghi</b><b>jkl</b><i>mno</i><i>pqr</i> ',
    ],
  [ '_abc_ _def_ *ghi* *jkl* /mno/ /pqr/'
    =>
    '<u>abc</u> <u>def</u> <b>ghi</b> <b>jkl</b> <i>mno</i> <i>pqr</i>
+',
    ],

  [  '_abc_ /def/'  =>  '<u>abc</u> <i>def</i>',  ],
  [ ' _abc_ /def/ ' => ' <u>abc</u> <i>def</i> ', ],

  [ 'x _abc_ /def/ x' => 'x <u>abc</u> <i>def</i> x', ],
  [ '_abc /xyz/ cba_ /def/' => '<u>abc <i>xyz</i> cba</u> <i>def</i>',
+ ],
  [ '_/one *t/w*o*/ th/r_ee_ null' => '<u><i>one <b>t/w*o</b></i> th/r
+_ee</u> null', ],

  [ '_/one *two*/ th/ree_ null _/f*ur *five*/ six_  null _/se_ven *eig
+ht*/ nine_ *fail_',
    =>
    '<u><i>one <b>two</b></i> th/ree</u> null <u><i>f*ur <b>five</b></
+i> six</u>  null <u><i>se_ven <b>eight</b></i> nine</u> *fail_',
    'from pm#11112991'
    ],

    );

# testing, testing...

FUNT:
for my $func_name (
    'W2H_1::w2h',  'W2H_2::w2h',
    ) {

    note "\n=== testing $func_name() ===\n\n";
    *w2h = do { no strict 'refs';  *$func_name; };

    VECTOR:
    for my $ar_vector (@Tests) {

        if (not ref $ar_vector) {
            note $ar_vector;
            next VECTOR;
            }

        my ($wiki, $expected, $comment) = @$ar_vector;
        $comment = defined $comment ? "$comment: " : '';

        my $got = w2h($wiki);

        DBPR_m_d1 and diag ":$wiki:";
        DBPR_m_d1 and diag ":$got:";

        is $got, $expected, "${comment}'$wiki' -> '$expected'";

        }  # end for VECTOR

    }  # end for FUNT

done_testing;

exit;

# subroutines ######################################################

# none for now
[download]

Give a man a fish: <%-{-{-{-<

Comment on Re: wiki regex reprocessing replacement Select or Download Code

Replies are listed 'Best First'.
Re^2: wiki regex reprocessing replacement by LanX (Saint) on Feb 16, 2020 at 11:52 UTC
Wow, thanks :) And the test suite ++ > One thing I don't understand is the inclusion of > < characters in the pre- and post-markup tag delimiters Because the repetitive solution with tf() needs to ignore previous runs. `/_word_/ -> <b>/_word_/</b> -> <b><i>_word_</i></b> -> etc.` The recursive solution with rec() doesn't really need it, one of the reasons why I prefer this approach. > probably because I'm not familiar with wikisyntax. No you are not wrong, there was information missing. In this particular case the syntax is also meant to coexist with more verbose html tags. There are cases where one doesn't want to have a whitespace in between neighboring tags. Just compare Re^3: Good Intentions: Wikisyntax for the Monastery and the complaint about `'ARGV'<br>` not expanding. Cheers Rolf _{(addicted to the Perl Programming Language :) Wikisyntax for the Monastery}	[reply] [d/l] [select]
Re^3: wiki regex reprocessing replacement by AnomalousMonk (Archbishop) on Feb 16, 2020 at 23:47 UTC
You're welcome. Here are some test cases I've added since posting. I'd be very interested to hear your comments, especially as regards the "questionable" ones. '--- tests added 16feb20 after pm#11113014 post ---', '"failing" (i.e., no transformation) tests', [ '' => '', ], [ '' => '', ], [ '_/' => '_/', ], [ ' * _ / ' => ' * _ / ', ], [ 'fail/' => 'fail/', ], [ ' * fail / ' => ' * fail / ', ], 'possibly questionable transformations', [ '__' => '<u></u>', ], [ ' __ ' => ' <u></u> ', ], [ '__ __' => '<u></u> <u></u>', ], [ ' __ __ ' => ' <u></u> <u></u> ', ], [ '____' => '<u></u><u></u>', '???' ], [ ' ____ ' => ' <u></u><u></u> ', '???' ], [ '______' => '<u></u><u></u><u></u>', '???' ], [ ' ______ ' => ' <u></u><u></u><u></u> ', '???' ], [ '________' => '<u></u><u></u><u></u><u></u>', '???' ], [ ' ________ ' => ' <u></u><u></u><u></u><u></u> ', '???' ], [ '__ __ __ __' => '<u></u> <u></u> <u></u> <u></u>', ], [ ' __ __ __ __ ' => ' <u></u> <u></u> <u></u> <u></u> ', ], [download] In this particular case the syntax is also meant to coexist with more verbose html tags. There are cases where one doesn't want to have a whitespace in between neighboring tags. Can you supply some test cases for variations, especially WRT intermixtures with standard HTML? Give a man a fish: `<%-{-{-{-<`	[reply] [d/l] [select]
Re^4: wiki regex reprocessing replacement by LanX (Saint) on Feb 17, 2020 at 00:55 UTC
> especially as regards the "questionable" ones Yes sorry. I didn't want to over complicate the question, and just wrote `.?` between the markup. Actually I'm using now something like `(\S.?(?<=\S))` to enforce at least one non-whitespace between the markers. The objective of the question was *"How best to allow ` / _` to be chained and or nested".** The recursive approach does it already pretty well. And actually nesting these markups is of rather low priority in the to-do list > Can you supply some test cases for variations, especially WRT intermixtures with standard HTML? That's my project: Wikisyntax for the Monastery =) JS-regex is mostly compatible to Perl4 regex. these are some tests I use ATM sub is_tf { my ($in,$out,$label) = @_; is( rec( $in ) => $out => "$label: \t'$in'\t->\t'$out'" ); } sub no_tf { my ($in,$label) = @_; is_tf($in,$in,$label); } no_tf( '*' => "no letter" ); is_tf( 'A' => '<b>A</b>' => "one letter"); is_tf( 'A B' ,'<b>A B</b>' , "multi word"); no_tf( ' A' , "before non-whitespace"); no_tf( 'A ' , "after non-whitespace"); no_tf( "A\nB" , "line break"); is_tf( 'A B' ,'<b>A B</b>' , "after non-whitespace prolonged"); is_tf( '//' ,'<i>*</i>' , "nested no letter"); [download] Cheers Rolf _{(addicted to the Perl Programming Language :) Wikisyntax for the Monastery}	[reply] [d/l] [select]


Do you know where your variables are?
	PerlMonks