in reply to wiki regex reprocessing replacement
Here's my take. One thing I don't understand is the inclusion of > < characters in the pre- and post-markup tag delimiters (update: e.g., my $pre = qr/(^|\s|>)/; here), probably because I'm not familiar with wikisyntax. Can you link me to a discussion of the role of these characters? I prepared two versions, one using (?(DEFINE) ...) and one based purely on qr// interpolation. Maybe one is faster, but I haven't done any Benchmark-ing (nor am I likely to).
File W2H_1.pm:
# W2H_1.pm 15feb20waw
package W2H_1;
use 5.010; # need regex extensions
use strict;
use warnings;
# use Data::Dump qw(pp dd); # for debug
use constant DEBUG => 0;
use constant {
DBPR_rx => 0 && DEBUG,
};
my %w2h_map = qw(* b / i _ u); # wiki markup -> HTML tag mapping
my $mq_wiki_mark = quotemeta join '', keys %w2h_map; # meta-quoted wi
+ki
my $rx_wml = qr{
((?&PRE_OPEN) (?&WIKI_MARK)) # $1: openin
+g wikimark
((?: (?: (?: (?! \g-2 (?&POST_CLOSE)) .)*+) | (?R) )*) # $2: marked
+ body
\g-2 (?&POST_CLOSE) # closing wi
+kimark
(?(DEFINE)
(?<PRE_OPEN> (?<! [^>\s$mq_wiki_mark])) # after string start, >
+ or \s or wml
(?<POST_CLOSE> (?! [^<\s$mq_wiki_mark])) # before string end, <
+ or \s or wml
(?<WIKI_MARK> [\Q$mq_wiki_mark\E])
)
}xms;
DBPR_rx and print $rx_wml, "\n";
# only used for development diagnostics
my $rx_pre_open = qr{ (?<! [^\s>]) }xms;
my $rx_post_close = qr{ (?! [^\s<]) }xms;
my $rx_wiki_mark = qr{ [$mq_wiki_mark] }xms;
# only used for development diagnostics
die q{bad '\A wm' match} unless '_' =~ m{ $rx_pre_open $rx_wiki_mar
+k }xms;
die q{bad '\s wm' match} unless ' _' =~ m{ $rx_pre_open $rx_wiki_mar
+k }xms;
die q{bad '> wm' match} unless '>_' =~ m{ $rx_pre_open $rx_wiki_mar
+k }xms;
die q{bad 'wm \z' match} unless '_' =~ m{ $rx_wiki_mark $rx_post_clo
+se }xms;
die q{bad 'wm \s' match} unless '_ ' =~ m{ $rx_wiki_mark $rx_post_clo
+se }xms;
die q{bad 'wm <' match} unless '_<' =~ m{ $rx_wiki_mark $rx_post_clo
+se }xms;
# subroutines ######################################################
sub w2h { # not exported; invoke fully-qualified
my ($wiki, # string possibly containing wiki markup
) = @_;
(my $html = $wiki) =~ s{
$rx_wml # captures to $1 $2
}
{ my ($html_tag, $body) = ($w2h_map{$1}, w2h($2));
"<$html_tag>$body</$html_tag>";
}xmsge;
return $html;
}
1;
File W2H_2.pm:
# W2H_2.pm 15feb20waw
package W2H_2;
use 5.010; # need regex extensions
use strict;
use warnings;
# use Data::Dump qw(pp dd); # for debug
use constant DEBUG => 0;
use constant {
DBPR_rx => 0 && DEBUG,
};
my %w2h_map = qw(* b / i _ u); # wiki markup -> HTML tag mapping
my $mq_wiki_mark = quotemeta join '', keys %w2h_map; # meta-quoted wi
+ki
my $rx_pre_open = qr{ (?<! [^>\s$mq_wiki_mark]) }xms;
my $rx_post_close = qr{ (?! [^<\s$mq_wiki_mark]) }xms;
my $rx_wiki_mark = qr{ [$mq_wiki_mark] }xms;
my $rx_wml = qr{
($rx_pre_open $rx_wiki_mark) # $1: openin
+g wikimark
((?: (?: (?: (?! \g-2 $rx_post_close) .)*+) | (?R) )*) # $2: marked
+ body
\g-2 $rx_post_close # closing wi
+kimark
}xms;
DBPR_rx and print $rx_wml, "\n";
# only used for development diagnostics
die q{bad '\A wm' match} unless '_' =~ m{ $rx_pre_open $rx_wiki_mar
+k }xms;
die q{bad '\s wm' match} unless ' _' =~ m{ $rx_pre_open $rx_wiki_mar
+k }xms;
die q{bad '> wm' match} unless '>_' =~ m{ $rx_pre_open $rx_wiki_mar
+k }xms;
die q{bad 'wm \z' match} unless '_' =~ m{ $rx_wiki_mark $rx_post_clo
+se }xms;
die q{bad 'wm \s' match} unless '_ ' =~ m{ $rx_wiki_mark $rx_post_clo
+se }xms;
die q{bad 'wm <' match} unless '_<' =~ m{ $rx_wiki_mark $rx_post_clo
+se }xms;
# subroutines ######################################################
sub w2h { # not exported; invoke fully-qualified
my ($wiki, # string possibly containing wiki markup
) = @_;
(my $html = $wiki) =~ s{
$rx_wml # captures to $1 $2
}
{ my ($html_tag, $body) = ($w2h_map{$1}, w2h($2));
"<$html_tag>$body</$html_tag>";
}xmsge;
return $html;
}
1;
File W2H.t:
# W2H.t 15feb20waw
use strict;
use warnings;
# use Data::Dump qw(pp dd); # for debug
use Test::More 'no_plan';
use Test::NoWarnings;
BEGIN {
use_ok 'W2H_1';
use_ok 'W2H_2';
}
use constant DEBUG => 1;
use constant {
DBPR_m_d1 => 0 && DEBUG,
};
my @Tests = (
[ '_abcd_' => '<u>abcd</u>', ],
[ '_/abcd/_' => '<u><i>abcd</i></u>', ],
[ '*_/abcd/_*' => '<b><u><i>abcd</i></u></b>', ],
[ '*_/a*b_c/d/_*' => '<b><u><i>a*b_c/d</i></u></b>', ],
[ ' *_/a*b_c/d/_* ' => ' <b><u><i>a*b_c/d</i></u></b> ', ],
[ '_abc__def_' => '<u>abc</u><u>def</u>', ],
[ '_abc__def_*ghi**jkl*/mno//pqr/'
=>
'<u>abc</u><u>def</u><b>ghi</b><b>jkl</b><i>mno</i><i>pqr</i>',
],
[ ' _abc__def_*ghi**jkl*/mno//pqr/ '
=>
' <u>abc</u><u>def</u><b>ghi</b><b>jkl</b><i>mno</i><i>pqr</i> ',
],
[ '_abc_ _def_ *ghi* *jkl* /mno/ /pqr/'
=>
'<u>abc</u> <u>def</u> <b>ghi</b> <b>jkl</b> <i>mno</i> <i>pqr</i>
+',
],
[ '_abc_ /def/' => '<u>abc</u> <i>def</i>', ],
[ ' _abc_ /def/ ' => ' <u>abc</u> <i>def</i> ', ],
[ 'x _abc_ /def/ x' => 'x <u>abc</u> <i>def</i> x', ],
[ '_abc /xyz/ cba_ /def/' => '<u>abc <i>xyz</i> cba</u> <i>def</i>',
+ ],
[ '_/one *t/w*o*/ th/r_ee_ null' => '<u><i>one <b>t/w*o</b></i> th/r
+_ee</u> null', ],
[ '_/one *two*/ th/ree_ null _/f*ur *five*/ six_ null _/se_ven *eig
+ht*/ nine_ *fail_',
=>
'<u><i>one <b>two</b></i> th/ree</u> null <u><i>f*ur <b>five</b></
+i> six</u> null <u><i>se_ven <b>eight</b></i> nine</u> *fail_',
'from pm#11112991'
],
);
# testing, testing...
FUNT:
for my $func_name (
'W2H_1::w2h', 'W2H_2::w2h',
) {
note "\n=== testing $func_name() ===\n\n";
*w2h = do { no strict 'refs'; *$func_name; };
VECTOR:
for my $ar_vector (@Tests) {
if (not ref $ar_vector) {
note $ar_vector;
next VECTOR;
}
my ($wiki, $expected, $comment) = @$ar_vector;
$comment = defined $comment ? "$comment: " : '';
my $got = w2h($wiki);
DBPR_m_d1 and diag ":$wiki:";
DBPR_m_d1 and diag ":$got:";
is $got, $expected, "${comment}'$wiki' -> '$expected'";
} # end for VECTOR
} # end for FUNT
done_testing;
exit;
# subroutines ######################################################
# none for now
Give a man a fish: <%-{-{-{-<
Re^2: wiki regex reprocessing replacement
by LanX (Saint) on Feb 16, 2020 at 11:52 UTC
|
Wow, thanks :)
And the test suite ++
> One thing I don't understand is the inclusion of > < characters in the pre- and post-markup tag delimiters
Because the repetitive solution with tf() needs to ignore previous runs.
*/_word_/* -> <b>/_word_/</b> -> <b><i>_word_</i></b> -> etc.
The recursive solution with rec() doesn't really need it, one of the reasons why I prefer this approach.
> probably because I'm not familiar with wikisyntax.
No you are not wrong, there was information missing.
In this particular case the syntax is also meant to coexist with more verbose html tags.
There are cases where one doesn't want to have a whitespace in between neighboring tags.
Just compare Re^3: Good Intentions: Wikisyntax for the Monastery and the complaint about 'ARGV'<br> not expanding.
| [reply] [d/l] [select] |
|
'--- tests added 16feb20 after pm#11113014 post ---',
'"failing" (i.e., no transformation) tests',
[ '' => '', ],
[ '*' => '*', ],
[ '*_/' => '*_/', ],
[ ' * _ / ' => ' * _ / ', ],
[ '*fail/' => '*fail/', ],
[ ' * fail / ' => ' * fail / ', ],
'possibly questionable transformations',
[ '__' => '<u></u>', ],
[ ' __ ' => ' <u></u> ', ],
[ '__ __' => '<u></u> <u></u>', ],
[ ' __ __ ' => ' <u></u> <u></u> ', ],
[ '____' => '<u></u><u></u>', '???' ],
[ ' ____ ' => ' <u></u><u></u> ', '???' ],
[ '______' => '<u></u><u></u><u></u>', '???' ],
[ ' ______ ' => ' <u></u><u></u><u></u> ', '???' ],
[ '________' => '<u></u><u></u><u></u><u></u>', '???' ],
[ ' ________ ' => ' <u></u><u></u><u></u><u></u> ', '???' ],
[ '__ __ __ __' => '<u></u> <u></u> <u></u> <u></u>', ],
[ ' __ __ __ __ ' => ' <u></u> <u></u> <u></u> <u></u> ', ],
In this particular case the syntax is also meant to coexist with more verbose html tags.
There are cases where one doesn't want to have a whitespace in between neighboring tags.
Can you supply some test cases for variations, especially WRT intermixtures with standard HTML?
Give a man a fish: <%-{-{-{-<
| [reply] [d/l] [select] |
|
> especially as regards the "questionable" ones
Yes sorry.
I didn't want to over complicate the question, and just wrote .*? between the markup.
Actually I'm using now something like (\S.*?(?<=\S)) to enforce at least one non-whitespace between the markers.
The objective of the question was "How best to allow * / _ to be chained and or nested".
The recursive approach does it already pretty well.
And actually nesting these markups is of rather low priority in the to-do list
> Can you supply some test cases for variations, especially WRT intermixtures with standard HTML?
That's my project: Wikisyntax for the Monastery =)
JS-regex is mostly compatible to Perl4 regex.
these are some tests I use ATM
sub is_tf {
my ($in,$out,$label) = @_;
is( rec( $in ) => $out => "$label: \t'$in'\t->\t'$out'" );
}
sub no_tf {
my ($in,$label) = @_;
is_tf($in,$in,$label);
}
no_tf( '**' => "no letter" );
is_tf( '*A*' => '<b>A</b>' => "one letter");
is_tf( '*A B*' ,'<b>A B</b>' , "multi word");
no_tf( '* A*' , "before non-whitespace");
no_tf( '*A *' , "after non-whitespace");
no_tf( "*A\nB*" , "line break");
is_tf( '*A *B*' ,'<b>A *B</b>' , "after non-whitespace prolonged");
is_tf( '/**/' ,'<i>**</i>' , "nested no letter");
| [reply] [d/l] [select] |
|
|