use strict;
use warnings;
use 5.010;
package Fake::Split;
use Exporter 'import';
our @EXPORT = (
qw( splitF splitF_explain splitF_match_pos splitF_pos splitF_pos2txt splitF_pos2list ),
qw( splitF_info splitF_test splitF_debug splitF_case ),
);
our @EXPORT_OK = qw(); # symbols to export on request
use Data::Dump qw(dump dd ddx);
my $debug = !!0;
sub splitF_debug { $debug = shift && !!1; }
my $info_level = 0;
sub splitF_info { $info_level = +shift; }
sub info {
return if $info_level <= 0;
if ( $info_level <= 1 || $debug ) { say STDERR @_ }
else { say @_ }
}
# Create a list with offsets (pos) for the start (using @LAST_MATCH_START / @-)
# and end (@LAST_MATCH_END /@+) of matches.
# The list contains: FIELD, [ SEPARATOR ], FIELD, [ SEPARATOR ], ...
# FIELD = field_start, field_end
# SEPARATOR = separator_start, separator_end, CAPTURE_1, CAPTURE_2, ...
# CAPTURE_n = capture_n_start, capture_n_end
# Can not handle /PATTERN/ as first parameter.
# Only compiled qr/PATTERN/ and strings can be used.
sub splitF_match_pos { # =====================================================
my ( $pat, $str_ref ) = @_;
my @res;
my $pos_last = 0;
my $pat_re = qr{$pat};
# while ( my $rv = $$str_ref =~ m{$pat_re}gc ) {
# does not work
my $str = $$str_ref;
while ( my $rv = $str =~ m{$pat_re}gc ) {
push @res, $pos_last, $-[0]; # the field
my @sep = ( $-[0], $+[0] ); # the separator
for my $ix ( 1 .. $#+ ) { # captures in the separator
if ( defined $-[$ix] ) {
push @sep, $-[$ix], $+[$ix];
}
else {
push @sep, -$-[0], -$+[0]; # or undef, undef
}
}
push @res, [@sep];
$pos_last = $+[0];
}
push @res, $pos_last, length($str); # rest of string
return \@res;
}
sub splitF_case { # /PATTERN/ with special treatment in split
my $pat = shift;
my $pat_re = qr{$pat};
my %case;
$case{$_} = !!0
for qw( pat_line_begin AWK_emul zero_width);
if ( $pat eq '^' || $pat eq qr{^} ) {
# split() silently optimizes /^/ to mean /^/m
# ^ Matches at the beginning of the string (or line, if /m is used)
info('PATTERN is /^/');
$case{pat_line_begin} = !!1;
}
elsif ( $pat eq ' ' ) { # emulation of AWK
# / / and qr/ / is NOT emulation of AWK
$case{AWK_emul} = !!1;
info("PATTERN is ' ' BUT qr{\\s+} is used(AWK_emul)");
}
elsif ( '' =~ m{$pat_re} ) {
info("PATTERN matches between chars");
$case{zero_width} = !!1;
}
return \%case;
}
# Uses the generic splitF_match_pos. This routine handles split specific things.
sub splitF_pos { # ==========================================================
my ( $pat, $str_ref ) = @_;
my %case = %{ splitF_case($pat) };
if ( $case{AWK_emul} ) {
# any contiguous whitespace (not just a single space character)
# is used as a separator;
$pat = qr{\s+};
}
if ( $case{pat_line_begin} ) {
# split() silently optimizes /^/ to mean /^/m
$pat = qr{^}m;
}
return \%case, [undef] if !defined $$str_ref;
return \%case, splitF_match_pos( $pat, $str_ref );
}
sub splitF_case2txt {
my $case_href = shift;
my %case = %$case_href;
my $case_txt = '';
$case_txt .= $_ for map { $case{$_} ? "$_ " : () } sort keys %case;
return $case_txt;
}
# Create a textual representation of the output from splitF_pos()
sub splitF_pos2txt {
my $str_ref = shift;
my $case_href = shift; # returnvalue from splitF_pos()
my $pos_aref = shift; # returnvalue from splitF_pos()
my $ix_last = @$pos_aref;
if ( defined $pos_aref && @$pos_aref == 1 ) {
return ['ERROR $str is undefined'], splitF_case2txt($case_href);
}
my @res;
my sub substring_1 {
my $start = shift;
my $end = shift;
return substr( $$str_ref, $start, $end - $start );
}
my sub separator_1 {
my $pos_aref = shift;
my $ix_last = @$pos_aref;
my @sep;
my $ix = 0;
while ( $ix < $ix_last ) {
my ( $start, $end ) = ( $pos_aref->[$ix], $pos_aref->[ $ix + 1 ] );
# negative pos indicates unmatched capture group
push @sep, '<', $start < 0 ? 'undef' : substring_1( $start, $end ),
'>';
$ix += 2;
}
return 'S' . join '', @sep;
}
my $ix = 0;
while ( $ix < $ix_last ) {
my ( $field_start, $field_end, $sep_aref ) =
@$pos_aref[ $ix .. $ix + 3 ];
$ix += 3;
push @res, 'F<' . substring_1( $field_start, $field_end ) . '>';
last if ( not defined $sep_aref );
push @res, separator_1($sep_aref);
}
return \@res, splitF_case2txt($case_href);
}
# The key part of the emulation of split.
# Create a list of strings from the output of splitF_pos().
# The created list of strings is ( should be ) the same as that from split
sub splitF_pos2list
{ # ==========================================================
my $par_nof = @_;
my $str_ref = shift;
my $case_href = shift; # returnvalue from splitF_pos()
my $pos_aref = shift; # returnvalue from splitF_pos()
my $limit = shift;
if ( !( defined $str_ref && !$$str_ref eq '' ) ) {
return ();
}
my %case = %$case_href;
$case{$_} = !!0
for qw( zero_width capturing lim_neg lim_omitted lim_positive );
if ( $par_nof < 3 || $par_nof > 4 ) {
die 'too few or too many parameters';
}
elsif ( $par_nof == 3 ) {
$case{lim_omitted} = !!1; # or $limit == 0
}
elsif ( $par_nof == 4 ) {
if ( $limit < 0 ) {
$case{lim_neg} = !!1;
$case{lim_omitted} = !!0;
}
elsif ( $limit > 0 ) {
$case{lim_positive} = !!1;
$case{lim_omitted} = !!0;
}
else {
$case{lim_omitted} = !!1; # or $limit == 0
}
}
my sub substring {
my $start = shift;
my $end = shift;
return substr( $$str_ref, $start, $end - $start );
}
my @res;
my sub separator {
my $pos_aref = shift;
my $ix_last = @$pos_aref - 1;
my $ix = 2; # skip the match of the sperator
while ( $ix < $ix_last ) {
my ( $start, $end ) = ( $pos_aref->[$ix], $pos_aref->[ $ix + 1 ] );
# negative pos indicates unmatched capture group
push @res, $start < 0 ? undef : substring( $start, $end );
$ix += 2;
}
}
my $ix = 0;
my $ix_last = @$pos_aref;
my $str_length = length $$str_ref;
my $field_nof = 0;
while ( $ix < $ix_last ) {
my ( $field_start, $field_end, $sep_aref ) =
@$pos_aref[ $ix .. $ix + 3 ];
my $last_field = $ix >= $ix_last - 2;
if ( $ix == 0 ) { # first field + seperator
if ($last_field) {
info('Only one field');
push @res, substring( $field_start, $str_length );
last;
}
$case{zero_width} = !!1 if $sep_aref->[1] == 0;
$case{capturing} = @$sep_aref >= 3;
if ( $case{zero_width} ) {
# "a zero-width match at the beginning never produces an empty field"
info('SKIP first field, zero width separator');
next;
}
if (
$field_start == 0
&& $field_end == 0
&& ( ( !$case{lim_omitted} && !$case{lim_positive} )
|| $case{AWK_emul} )
)
{
info('SKIP first field+separator, both are empty');
next;
}
}
if ( $field_start == $field_end
&& $sep_aref
&& $field_start == $sep_aref->[0]
&& $field_start == $sep_aref->[1] )
{
info('SKIP intermediate field+separator, both are empty');
next;
}
$field_nof++;
if ( $case{lim_positive} && $field_nof >= $limit ) {
info("Reached nof field limit $limit");
push @res, substring( $field_start, $str_length );
last;
}
if ( $last_field
&& $field_start == $field_end
&& $case{lim_positive}
&& $case{lim_neg} )
{
info('SKIP the last field');
last;
}
# add field to result
push @res, substring( $field_start, $field_end );
# add separator to result
next if ( !defined $sep_aref || !$case{capturing} );
separator($sep_aref);
}
continue { $ix += 3; } # $ix for next field
my @removed;
if ( $case{lim_omitted} ) {
push @removed, pop @res
while ( @res && ( !defined $res[-1] || $res[-1] eq '' ) );
}
info( 'REMOVE from end: ', dump @removed ) if @removed;
return @res;
}
sub split_core { # facade to core split ======================================
info "\nsplit_core", dump @_;
my $par_nof = @_;
if ( $par_nof == 0 ) { warn 'ERROR'; }
elsif ( $par_nof == 1 ) { return split $_[0]; }
elsif ( $par_nof == 2 ) { return split $_[0], $_[1]; }
elsif ( $par_nof == 3 ) { return split $_[0], $_[1], $_[2]; }
else { warn 'ERROR' }
}
sub splitF_test($;$$)
{ # test emulation of split ================================================
info( "\nsplitF_test", dump @_ );
my @rv_F = splitF(@_);
my @rv_c = split_core(@_);
info( 'split_c: ', dump @rv_c );
return [@rv_F], [@rv_c], dump @_;
}
sub splitF_explain($;$$)
{ # explains the output from splitF_pos() ===========================
info( "\nsplitF_explain", dump @_ );
my ( $case_href, $pos_aref ) = splitF_pos( $_[0], \$_[1] );
return splitF_pos2txt( \$_[1], $case_href, $pos_aref );
}
# Emulation of split ==========================================================
# in /PATTERN/, the first parameter, the match operator /STRING/ must be replaced
# with qr/STRING/, the compiled regular expression.
sub splitF($;$$) {
info( "\nsplitF", dump @_ );
my ( $case_href, $pos_aref ) = splitF_pos( $_[0], \$_[1] );
my @rv =
splitF_pos2list( \$_[1], $case_href, $pos_aref, @_ > 2 ? $_[2] : () );
info( 'splitF: ', dump @rv );
return @rv;
}
!!1;
####
use strict;
use warnings;
use 5.010;
use Test::More;
use lib 'lib';
use Fake::Split;
splitF_info( 1 );
plan tests => 73;
is_deeply splitF_test(" ", "a b c");
is_deeply splitF_test("^", "a\nb\nc");
is_deeply splitF_test(qr/:/, undef);
is_deeply splitF_test(":", "a:b:c");
is_deeply splitF_test(qr/:b:/, "a:b:c");
is_deeply splitF_test(qr//, "abc\n");
is_deeply splitF_test(qr/:/, "a:b:c::::");
is_deeply splitF_test(" ", " a b\tc \t d ");
is_deeply splitF_test(qr/ */, "foo bar bie\tdoll");
is_deeply splitF_test(qr/ /, "a b c");
is_deeply splitF_test(" ", "1 2 3 4 5 6", 3);
is_deeply splitF_test(" ", "1 2 3 4 5 6", 4);
is_deeply splitF_test(qr/:/, "1:2:3:4:5:6:::", 999);
is_deeply splitF_test(" ", "1 2 3 4 5 6", 2);
is_deeply splitF_test(qr/,|(-)/, "1-10,20,,,");
is_deeply splitF_test(qr/,|(-)/, "1-10,20,,,", 10);
is_deeply splitF_test(qr/x/, "", -1);
is_deeply splitF_test(qr/x/, "", 1);
is_deeply splitF_test(qr/(p+)/, "", -1);
is_deeply splitF_test(qr/.?/, "", -1);
is_deeply splitF_test(qr/^a/m, "a b a\na d a", 20);
is_deeply splitF_test(qr/a$/m, "a b a\na d a", 20);
is_deeply splitF_test(qr/^aa/m, "aa b aa\naa d aa", 20);
is_deeply splitF_test(qr/aa$/m, "aa b aa\naa d aa", 20);
is_deeply splitF_test(qr/\s*:\s*/, "a : b :c: d");
is_deeply splitF_test(1, "p1q1r1s");
is_deeply splitF_test(qr/^/, "ab\ncd\nef\n");
is_deeply splitF_test(qr/\A/, "ab\ncd\nef\n");
is_deeply splitF_test(qr/(?=\w)/, "rm b");
is_deeply splitF_test(qr//, v1.20.300.4000.50000.4000.300.20.1);
is_deeply splitF_test(qr/\x{FE}/, "\xFF\xFE\xFD");
is_deeply splitF_test(qr/(\x{FE}\xFE)/, "\xFF\xFF\xFE\xFE\xFD\xFD");
is_deeply splitF_test(qr//, "\x{4D2}{\x{929}");
is_deeply splitF_test(qr/A/, "\x{4D2}A\x{929}");
is_deeply splitF_test(qr//, "\x{B36C}\x{5A8C}\x{FF5B}\x{5079}\x{505B}");
is_deeply splitF_test(qr/\x40/, " \@\x80\x{100}\x80\@ ");
is_deeply splitF_test(qr/(?^u:\x{100})/, " \@\x80\x{100}\x80\@ ");
is_deeply splitF_test(qr/(?^u:\x{80}\x{100}\x{80})/, " \@\x80\x{100}\x80\@ ");
is_deeply splitF_test(qr/\x40\x{80}/, " \@\x80\x{100}\x80\@ ");
is_deeply splitF_test(qr/[\x40\x{80}]+/, " \@\x80\x{100}\x80\@ ");
is_deeply splitF_test(qr//, "ABC\x{263A}");
is_deeply splitF_test(qr/\xFE/, "\xFF\xFE\xFD");
is_deeply splitF_test(qr/\s+/, "hello cruel world");
is_deeply splitF_test(qr/ll/, "hello cruel world");
is_deeply splitF_test(qr/(A)|B/, "1B2");
is_deeply splitF_test(qr/\r?\n/, "\x{10F1FF}\n");
is_deeply splitF_test(qr/[,]/, "readin,database,readout");
is_deeply splitF_test(qr/[, ]+/, "a,b");
is_deeply splitF_test(qr/(?^u:ä)/, "a\xE4b");
is_deeply splitF_test(qr/(?^u:ä)/, "axb");
is_deeply splitF_test(qr/,/, "");
is_deeply splitF_test(qr/,/, ",,,,,");
is_deeply splitF_test(" \0 ", "ABC \0 FOO \0 XYZ");
is_deeply splitF_test(qr/ \0 /, "ABC \0 FOO \0 XYZ");
is_deeply splitF_test(1, "", {});
is_deeply splitF_test(qr/::/, "Font::GlyphNames");
is_deeply splitF_test(" ", "foo bar");
is_deeply splitF_test(qr/ /, "foo bar");
is_deeply splitF_test(qr/\s/, " a b c ");
is_deeply splitF_test(qr/ /, " a b c ");
is_deeply splitF_test(" ", " a b c ");
is_deeply splitF_test(" ", " a \tb c ");
is_deeply splitF_test(" ", " foo ");
is_deeply splitF_test(qr/ /, " foo ");
is_deeply splitF_test(qr//, undef, 0);
is_deeply splitF_test(qr//, "foobarbaz");
is_deeply splitF_test(qr//, "abc");
#is_deeply splitF_test(qr/-(?{ $c++ })/, "a-b-c");
is_deeply splitF_test(qr/:/, "a:b:c");
is_deeply splitF_test(qr/:/, "a:b:c:d:e");
is_deeply splitF_test(qr/-/, "-");
is_deeply splitF_test(" ", "");
is_deeply splitF_test("", "ab");
is_deeply splitF_test(";", "a;b");
##
##
if ( $field_start == $field_end
&& $sep_aref
&& $field_start == $sep_aref->[0]
&& $field_start == $sep_aref->[1] )
{
info('SKIP intermediate field+separator, both are empty');
next;
}
##
##
# while ( my $rv = $$str_ref =~ m{$pat_re}gc ) { does not work
# but this works:
my $str = $$str_ref;
while ( my $rv = $str =~ m{$pat_re}gc ) {