Beefy Boxes and Bandwidth Generously Provided by pair Networks
laziness, impatience, and hubris
 
PerlMonks  

Re: truncate string to byte count

by ikegami (Pope)
on Feb 28, 2019 at 20:43 UTC ( #1230687=note: print w/replies, xml ) Need Help??


in reply to truncate string to byte count

A valid cut is one that isn't followed by a continuation byte (0b10xx_xxxx).

sub truncate_utf8 { my ($utf8, $len) = @_; $len += 0; # Make sure $len is a number. return $utf8 =~ s/^.{0,$len}\K(?![\x80-\xBF]).*//sr; }
or
sub truncate_utf8 { my ($utf8, $len) = @_; return $utf8 if length($utf8) <= $len; my $next = substr($utf8, $len, length($utf8)-$len, ''); $next = chop($utf8) while (ord($next) & 0xC0) == 0x80; return $utf8; }

Both of these take text that is already encoded using UTF-8.

Update: Fixed typo mentioned by haukex.
Update: Made clear what the input should be.

Replies are listed 'Best First'.
Re^2: truncate string to byte count
by haukex (Bishop) on Feb 28, 2019 at 21:04 UTC
    sub truncate_utf8 { my ($utf8, $len) = @_; $len += 0; # Make sure $len is a number. return $utf8 =~ s/^.{0,$len)\K(?![\x80-\xBF]).*//sr; }

    Unmatched ) in regex, and if I fix that:

    use warnings; use strict; use Test::More tests=>15; use open qw/:std :utf8/; my $in = "\N{U+CF} \N{U+2764} \N{U+1F42A}"; is truncate_utf8($in, 11), "\N{U+CF} \N{U+2764} \N{U+1F42A}"; is truncate_utf8($in, $_), "\N{U+CF} \N{U+2764} " for 10,9,8,7; is truncate_utf8($in, 6), "\N{U+CF} \N{U+2764}"; is truncate_utf8($in, $_), "\N{U+CF} " for 5,4,3; is truncate_utf8($in, 2), "\N{U+CF}"; is truncate_utf8($in, $_), "" for 1,0; $in = "\xE4b"; utf8::downgrade($in); # make sure this really is a non-UTF8 string is truncate_utf8($in, 2), "\xE4b"; is truncate_utf8($in, 1), "\xE4"; is truncate_utf8($in, 0), ""; sub truncate_utf8 { my ($utf8, $len) = @_; $len += 0; # Make sure $len is a number. return $utf8 =~ s/^.{0,$len}\K(?![\x80-\xBF]).*//sr; } __END__ 1..15 ok 1 not ok 2 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 3 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 4 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 5 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 6 # Failed test at x.pl line 10. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084;' not ok 7 # Failed test at x.pl line 11. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' ' not ok 8 # Failed test at x.pl line 11. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; ' # expected: ' ' not ok 9 # Failed test at x.pl line 11. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084;' # expected: ' ' not ok 10 # Failed test at x.pl line 12. # got: '&#65533; ' # expected: '&#65533;' not ok 11 # Failed test at x.pl line 13. # got: '&#65533;' # expected: '' ok 12 ok 13 ok 14 ok 15 # Looks like you failed 10 tests of 15.
    sub truncate_utf8 { my ($utf8, $len) = @_; return $utf8 if length($utf8) <= $len; my $next = substr($utf8, $len, length($utf8)-$len, ''); $next = chop($utf8) while (ord($next) & 0xC0) == 0x80; return $utf8; }
    use warnings; use strict; use Test::More tests=>15; use open qw/:std :utf8/; my $in = "\N{U+CF} \N{U+2764} \N{U+1F42A}"; is truncate_utf8($in, 11), "\N{U+CF} \N{U+2764} \N{U+1F42A}"; is truncate_utf8($in, $_), "\N{U+CF} \N{U+2764} " for 10,9,8,7; is truncate_utf8($in, 6), "\N{U+CF} \N{U+2764}"; is truncate_utf8($in, $_), "\N{U+CF} " for 5,4,3; is truncate_utf8($in, 2), "\N{U+CF}"; is truncate_utf8($in, $_), "" for 1,0; $in = "\xE4b"; utf8::downgrade($in); # make sure this really is a non-UTF8 string is truncate_utf8($in, 2), "\xE4b"; is truncate_utf8($in, 1), "\xE4"; is truncate_utf8($in, 0), ""; sub truncate_utf8 { my ($utf8, $len) = @_; return $utf8 if length($utf8) <= $len; my $next = substr($utf8, $len, length($utf8)-$len, ''); $next = chop($utf8) while (ord($next) & 0xC0) == 0x80; return $utf8; } __END__ 1..15 ok 1 not ok 2 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 3 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 4 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 5 # Failed test at x.pl line 9. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084; ' not ok 6 # Failed test at x.pl line 10. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' &#10084;' not ok 7 # Failed test at x.pl line 11. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; &#128042;' # expected: ' ' not ok 8 # Failed test at x.pl line 11. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084; ' # expected: ' ' not ok 9 # Failed test at x.pl line 11. Wide character in print at /opt/perl5.28/lib/site_perl/5.28.1/Test2/Fo +rmatter/TAP.pm line 112. # got: ' &#10084;' # expected: ' ' not ok 10 # Failed test at x.pl line 12. # got: '&#65533; ' # expected: '&#65533;' not ok 11 # Failed test at x.pl line 13. # got: '&#65533;' # expected: '' ok 12 ok 13 ok 14 ok 15 # Looks like you failed 10 tests of 15.

      Tests 1 to 11 are incorrect because they don't provide UTF-8. Replace

      is truncate_utf8($in, ...), "...";
      with
      is truncate_utf8(encode_utf8($in), ...), encode_utf8("...");
        Tests 1 to 12 are incorrect because they don't provide UTF-8.

        I think you're making assumptions about what the OP's input data looks like...

        Update:

        Tests 1 to 11 ... Replace ...

        It is uncool to update a node in a way that renders replies confusing or meaningless.

        Update 2: How about "is is uncool to make someone chase down all of your ninja edits and wonder when the ninja editing will be done." You made several other ninja edits (like this one) to your nodes and I had to update my replies several times while composing them.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://1230687]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others wandering the Monastery: (6)
As of 2021-03-01 23:23 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    My favorite kind of desktop background is:











    Results (28 votes). Check out past polls.

    Notices?