use strict; use warnings; use 5.010; package Fake::Split; use Exporter 'import'; our @EXPORT = ( qw( splitF splitF_explain splitF_match_pos splitF_pos splitF_pos2txt splitF_pos2list ), qw( splitF_info splitF_test splitF_debug splitF_case ), ); our @EXPORT_OK = qw(); # symbols to export on request use Data::Dump qw(dump dd ddx); my $debug = !!0; sub splitF_debug { $debug = shift && !!1; } my $info_level = 0; sub splitF_info { $info_level = +shift; } sub info { return if $info_level <= 0; if ( $info_level <= 1 || $debug ) { say STDERR @_ } else { say @_ } } # Create a list with offsets (pos) for the start (using @LAST_MATCH_START / @-) # and end (@LAST_MATCH_END /@+) of matches. # The list contains: FIELD, [ SEPARATOR ], FIELD, [ SEPARATOR ], ... # FIELD = field_start, field_end # SEPARATOR = separator_start, separator_end, CAPTURE_1, CAPTURE_2, ... # CAPTURE_n = capture_n_start, capture_n_end # Can not handle /PATTERN/ as first parameter. # Only compiled qr/PATTERN/ and strings can be used. sub splitF_match_pos { # ===================================================== my ( $pat, $str_ref ) = @_; my @res; my $pos_last = 0; my $pat_re = qr{$pat}; # while ( my $rv = $$str_ref =~ m{$pat_re}gc ) { # does not work my $str = $$str_ref; while ( my $rv = $str =~ m{$pat_re}gc ) { push @res, $pos_last, $-[0]; # the field my @sep = ( $-[0], $+[0] ); # the separator for my $ix ( 1 .. $#+ ) { # captures in the separator if ( defined $-[$ix] ) { push @sep, $-[$ix], $+[$ix]; } else { push @sep, -$-[0], -$+[0]; # or undef, undef } } push @res, [@sep]; $pos_last = $+[0]; } push @res, $pos_last, length($str); # rest of string return \@res; } sub splitF_case { # /PATTERN/ with special treatment in split my $pat = shift; my $pat_re = qr{$pat}; my %case; $case{$_} = !!0 for qw( pat_line_begin AWK_emul zero_width); if ( $pat eq '^' || $pat eq qr{^} ) { # split() silently optimizes /^/ to mean /^/m # ^ Matches at the beginning of the string (or line, if /m is used) info('PATTERN is /^/'); $case{pat_line_begin} = !!1; } elsif ( $pat eq ' ' ) { # emulation of AWK # / / and qr/ / is NOT emulation of AWK $case{AWK_emul} = !!1; info("PATTERN is ' ' BUT qr{\\s+} is used(AWK_emul)"); } elsif ( '' =~ m{$pat_re} ) { info("PATTERN matches between chars"); $case{zero_width} = !!1; } return \%case; } # Uses the generic splitF_match_pos. This routine handles split specific things. sub splitF_pos { # ========================================================== my ( $pat, $str_ref ) = @_; my %case = %{ splitF_case($pat) }; if ( $case{AWK_emul} ) { # any contiguous whitespace (not just a single space character) # is used as a separator; $pat = qr{\s+}; } if ( $case{pat_line_begin} ) { # split() silently optimizes /^/ to mean /^/m $pat = qr{^}m; } return \%case, [undef] if !defined $$str_ref; return \%case, splitF_match_pos( $pat, $str_ref ); } sub splitF_case2txt { my $case_href = shift; my %case = %$case_href; my $case_txt = ''; $case_txt .= $_ for map { $case{$_} ? "$_ " : () } sort keys %case; return $case_txt; } # Create a textual representation of the output from splitF_pos() sub splitF_pos2txt { my $str_ref = shift; my $case_href = shift; # returnvalue from splitF_pos() my $pos_aref = shift; # returnvalue from splitF_pos() my $ix_last = @$pos_aref; if ( defined $pos_aref && @$pos_aref == 1 ) { return ['ERROR $str is undefined'], splitF_case2txt($case_href); } my @res; my sub substring_1 { my $start = shift; my $end = shift; return substr( $$str_ref, $start, $end - $start ); } my sub separator_1 { my $pos_aref = shift; my $ix_last = @$pos_aref; my @sep; my $ix = 0; while ( $ix < $ix_last ) { my ( $start, $end ) = ( $pos_aref->[$ix], $pos_aref->[ $ix + 1 ] ); # negative pos indicates unmatched capture group push @sep, '<', $start < 0 ? 'undef' : substring_1( $start, $end ), '>'; $ix += 2; } return 'S' . join '', @sep; } my $ix = 0; while ( $ix < $ix_last ) { my ( $field_start, $field_end, $sep_aref ) = @$pos_aref[ $ix .. $ix + 3 ]; $ix += 3; push @res, 'F<' . substring_1( $field_start, $field_end ) . '>'; last if ( not defined $sep_aref ); push @res, separator_1($sep_aref); } return \@res, splitF_case2txt($case_href); } # The key part of the emulation of split. # Create a list of strings from the output of splitF_pos(). # The created list of strings is ( should be ) the same as that from split sub splitF_pos2list { # ========================================================== my $par_nof = @_; my $str_ref = shift; my $case_href = shift; # returnvalue from splitF_pos() my $pos_aref = shift; # returnvalue from splitF_pos() my $limit = shift; if ( !( defined $str_ref && !$$str_ref eq '' ) ) { return (); } my %case = %$case_href; $case{$_} = !!0 for qw( zero_width capturing lim_neg lim_omitted lim_positive ); if ( $par_nof < 3 || $par_nof > 4 ) { die 'too few or too many parameters'; } elsif ( $par_nof == 3 ) { $case{lim_omitted} = !!1; # or $limit == 0 } elsif ( $par_nof == 4 ) { if ( $limit < 0 ) { $case{lim_neg} = !!1; $case{lim_omitted} = !!0; } elsif ( $limit > 0 ) { $case{lim_positive} = !!1; $case{lim_omitted} = !!0; } else { $case{lim_omitted} = !!1; # or $limit == 0 } } my sub substring { my $start = shift; my $end = shift; return substr( $$str_ref, $start, $end - $start ); } my @res; my sub separator { my $pos_aref = shift; my $ix_last = @$pos_aref - 1; my $ix = 2; # skip the match of the sperator while ( $ix < $ix_last ) { my ( $start, $end ) = ( $pos_aref->[$ix], $pos_aref->[ $ix + 1 ] ); # negative pos indicates unmatched capture group push @res, $start < 0 ? undef : substring( $start, $end ); $ix += 2; } } my $ix = 0; my $ix_last = @$pos_aref; my $str_length = length $$str_ref; my $field_nof = 0; while ( $ix < $ix_last ) { my ( $field_start, $field_end, $sep_aref ) = @$pos_aref[ $ix .. $ix + 3 ]; my $last_field = $ix >= $ix_last - 2; if ( $ix == 0 ) { # first field + seperator if ($last_field) { info('Only one field'); push @res, substring( $field_start, $str_length ); last; } $case{zero_width} = !!1 if $sep_aref->[1] == 0; $case{capturing} = @$sep_aref >= 3; if ( $case{zero_width} ) { # "a zero-width match at the beginning never produces an empty field" info('SKIP first field, zero width separator'); next; } if ( $field_start == 0 && $field_end == 0 && ( ( !$case{lim_omitted} && !$case{lim_positive} ) || $case{AWK_emul} ) ) { info('SKIP first field+separator, both are empty'); next; } } if ( $field_start == $field_end && $sep_aref && $field_start == $sep_aref->[0] && $field_start == $sep_aref->[1] ) { info('SKIP intermediate field+separator, both are empty'); next; } $field_nof++; if ( $case{lim_positive} && $field_nof >= $limit ) { info("Reached nof field limit $limit"); push @res, substring( $field_start, $str_length ); last; } if ( $last_field && $field_start == $field_end && $case{lim_positive} && $case{lim_neg} ) { info('SKIP the last field'); last; } # add field to result push @res, substring( $field_start, $field_end ); # add separator to result next if ( !defined $sep_aref || !$case{capturing} ); separator($sep_aref); } continue { $ix += 3; } # $ix for next field my @removed; if ( $case{lim_omitted} ) { push @removed, pop @res while ( @res && ( !defined $res[-1] || $res[-1] eq '' ) ); } info( 'REMOVE from end: ', dump @removed ) if @removed; return @res; } sub split_core { # facade to core split ====================================== info "\nsplit_core", dump @_; my $par_nof = @_; if ( $par_nof == 0 ) { warn 'ERROR'; } elsif ( $par_nof == 1 ) { return split $_[0]; } elsif ( $par_nof == 2 ) { return split $_[0], $_[1]; } elsif ( $par_nof == 3 ) { return split $_[0], $_[1], $_[2]; } else { warn 'ERROR' } } sub splitF_test($;$$) { # test emulation of split ================================================ info( "\nsplitF_test", dump @_ ); my @rv_F = splitF(@_); my @rv_c = split_core(@_); info( 'split_c: ', dump @rv_c ); return [@rv_F], [@rv_c], dump @_; } sub splitF_explain($;$$) { # explains the output from splitF_pos() =========================== info( "\nsplitF_explain", dump @_ ); my ( $case_href, $pos_aref ) = splitF_pos( $_[0], \$_[1] ); return splitF_pos2txt( \$_[1], $case_href, $pos_aref ); } # Emulation of split ========================================================== # in /PATTERN/, the first parameter, the match operator /STRING/ must be replaced # with qr/STRING/, the compiled regular expression. sub splitF($;$$) { info( "\nsplitF", dump @_ ); my ( $case_href, $pos_aref ) = splitF_pos( $_[0], \$_[1] ); my @rv = splitF_pos2list( \$_[1], $case_href, $pos_aref, @_ > 2 ? $_[2] : () ); info( 'splitF: ', dump @rv ); return @rv; } !!1; #### use strict; use warnings; use 5.010; use Test::More; use lib 'lib'; use Fake::Split; splitF_info( 1 ); plan tests => 73; is_deeply splitF_test(" ", "a b c"); is_deeply splitF_test("^", "a\nb\nc"); is_deeply splitF_test(qr/:/, undef); is_deeply splitF_test(":", "a:b:c"); is_deeply splitF_test(qr/:b:/, "a:b:c"); is_deeply splitF_test(qr//, "abc\n"); is_deeply splitF_test(qr/:/, "a:b:c::::"); is_deeply splitF_test(" ", " a b\tc \t d "); is_deeply splitF_test(qr/ */, "foo bar bie\tdoll"); is_deeply splitF_test(qr/ /, "a b c"); is_deeply splitF_test(" ", "1 2 3 4 5 6", 3); is_deeply splitF_test(" ", "1 2 3 4 5 6", 4); is_deeply splitF_test(qr/:/, "1:2:3:4:5:6:::", 999); is_deeply splitF_test(" ", "1 2 3 4 5 6", 2); is_deeply splitF_test(qr/,|(-)/, "1-10,20,,,"); is_deeply splitF_test(qr/,|(-)/, "1-10,20,,,", 10); is_deeply splitF_test(qr/x/, "", -1); is_deeply splitF_test(qr/x/, "", 1); is_deeply splitF_test(qr/(p+)/, "", -1); is_deeply splitF_test(qr/.?/, "", -1); is_deeply splitF_test(qr/^a/m, "a b a\na d a", 20); is_deeply splitF_test(qr/a$/m, "a b a\na d a", 20); is_deeply splitF_test(qr/^aa/m, "aa b aa\naa d aa", 20); is_deeply splitF_test(qr/aa$/m, "aa b aa\naa d aa", 20); is_deeply splitF_test(qr/\s*:\s*/, "a : b :c: d"); is_deeply splitF_test(1, "p1q1r1s"); is_deeply splitF_test(qr/^/, "ab\ncd\nef\n"); is_deeply splitF_test(qr/\A/, "ab\ncd\nef\n"); is_deeply splitF_test(qr/(?=\w)/, "rm b"); is_deeply splitF_test(qr//, v1.20.300.4000.50000.4000.300.20.1); is_deeply splitF_test(qr/\x{FE}/, "\xFF\xFE\xFD"); is_deeply splitF_test(qr/(\x{FE}\xFE)/, "\xFF\xFF\xFE\xFE\xFD\xFD"); is_deeply splitF_test(qr//, "\x{4D2}{\x{929}"); is_deeply splitF_test(qr/A/, "\x{4D2}A\x{929}"); is_deeply splitF_test(qr//, "\x{B36C}\x{5A8C}\x{FF5B}\x{5079}\x{505B}"); is_deeply splitF_test(qr/\x40/, " \@\x80\x{100}\x80\@ "); is_deeply splitF_test(qr/(?^u:\x{100})/, " \@\x80\x{100}\x80\@ "); is_deeply splitF_test(qr/(?^u:\x{80}\x{100}\x{80})/, " \@\x80\x{100}\x80\@ "); is_deeply splitF_test(qr/\x40\x{80}/, " \@\x80\x{100}\x80\@ "); is_deeply splitF_test(qr/[\x40\x{80}]+/, " \@\x80\x{100}\x80\@ "); is_deeply splitF_test(qr//, "ABC\x{263A}"); is_deeply splitF_test(qr/\xFE/, "\xFF\xFE\xFD"); is_deeply splitF_test(qr/\s+/, "hello cruel world"); is_deeply splitF_test(qr/ll/, "hello cruel world"); is_deeply splitF_test(qr/(A)|B/, "1B2"); is_deeply splitF_test(qr/\r?\n/, "\x{10F1FF}\n"); is_deeply splitF_test(qr/[,]/, "readin,database,readout"); is_deeply splitF_test(qr/[, ]+/, "a,b"); is_deeply splitF_test(qr/(?^u:ä)/, "a\xE4b"); is_deeply splitF_test(qr/(?^u:ä)/, "axb"); is_deeply splitF_test(qr/,/, ""); is_deeply splitF_test(qr/,/, ",,,,,"); is_deeply splitF_test(" \0 ", "ABC \0 FOO \0 XYZ"); is_deeply splitF_test(qr/ \0 /, "ABC \0 FOO \0 XYZ"); is_deeply splitF_test(1, "", {}); is_deeply splitF_test(qr/::/, "Font::GlyphNames"); is_deeply splitF_test(" ", "foo bar"); is_deeply splitF_test(qr/ /, "foo bar"); is_deeply splitF_test(qr/\s/, " a b c "); is_deeply splitF_test(qr/ /, " a b c "); is_deeply splitF_test(" ", " a b c "); is_deeply splitF_test(" ", " a \tb c "); is_deeply splitF_test(" ", " foo "); is_deeply splitF_test(qr/ /, " foo "); is_deeply splitF_test(qr//, undef, 0); is_deeply splitF_test(qr//, "foobarbaz"); is_deeply splitF_test(qr//, "abc"); #is_deeply splitF_test(qr/-(?{ $c++ })/, "a-b-c"); is_deeply splitF_test(qr/:/, "a:b:c"); is_deeply splitF_test(qr/:/, "a:b:c:d:e"); is_deeply splitF_test(qr/-/, "-"); is_deeply splitF_test(" ", ""); is_deeply splitF_test("", "ab"); is_deeply splitF_test(";", "a;b"); #### if ( $field_start == $field_end && $sep_aref && $field_start == $sep_aref->[0] && $field_start == $sep_aref->[1] ) { info('SKIP intermediate field+separator, both are empty'); next; } #### # while ( my $rv = $$str_ref =~ m{$pat_re}gc ) { does not work # but this works: my $str = $$str_ref; while ( my $rv = $str =~ m{$pat_re}gc ) {