An easier solution is to dodge the UTF-8 problem. The slowness of length is explained by its documentation (from
length):
length() normally deals in logical characters, not physical bytes
Essentially, in order to know how many characters are in the string,
length has to interrogate every byte to see if it is part of a longer character. (Incidentally, your timings look linear to me, not exponential.) You can avoid this problem if, instead of storing the string as you encounter it, you store it encoded:
{
    # LenTestC: accumulator that stores its data already UTF-8 encoded.
    # The object is a blessed reference to one byte string, so len()
    # reports encoded bytes rather than logical characters.
    package LenTestC;
    use Encode;

    # Constructor: returns a blessed reference to a new empty string.
    sub new {
        my ($class) = @_;
        my $buffer = '';
        return bless \$buffer, $class;
    }

    # Encodes $data to UTF-8 octets and appends them to the buffer.
    sub add {
        my $self = shift;
        my $data = shift;
        ${$self} .= Encode::encode_utf8($data);
    }

    # Returns length() of the encoded buffer, i.e. a byte count.
    sub len {
        my ($self) = @_;
        return length ${$self};
    }
}
The target string never gets upgraded to UTF-8, and thus the fast
length algorithm can be used.
Note that your print/tell solution did the same kind of accounting, reporting bytes instead of characters.
use strict;
use warnings;
use feature 'say';
use feature 'state';
use utf8;
use Time::HiRes;
$| = 1;    # unbuffer STDOUT so each progress line appears as soon as it is measured

# Test payload: 256 non-ASCII characters appended on every iteration.
my $chunk        = '€' x 256;
my $iterations   = 15_000;
my $report_every = 1000;

run_benchmark('with length()',          'LenTestA', $chunk, $iterations, $report_every);
run_benchmark("\nwith a scalar",        'LenTestB', $chunk, $iterations, $report_every);
run_benchmark("\nwith encode/length()", 'LenTestC', $chunk, $iterations, $report_every);

# Runs one benchmark: prints $label, builds a fresh $class object, then
# appends $chunk and queries the length $iterations times.  After every
# $report_every iterations it prints the iteration number, the reported
# length, and the seconds elapsed since the previous report.
#
#   $label        - heading printed before the run
#   $class        - package name providing new(), add($data) and len()
#   $chunk        - string appended on each iteration
#   $iterations   - total number of add/len rounds
#   $report_every - print a timing line each time this many rounds complete
#
# Returns nothing.
sub run_benchmark {
    my ($label, $class, $chunk, $iterations, $report_every) = @_;

    say $label;
    my $str = $class->new;

    # The clock is restarted for every run, right before the loop, so the
    # first interval of each benchmark measures the same thing.
    my $interval_start = Time::HiRes::time;

    for my $n (1 .. $iterations) {
        $str->add($chunk);
        my $length = $str->len;    # queried every round: this call is what is being timed

        next if $n % $report_every;

        my $now = Time::HiRes::time;
        say sprintf "%12d L=%10d t=%f", $n, $length, $now - $interval_start;
        $interval_start = $now;
    }

    return;
}
{
    # LenTestA: baseline accumulator. The object is a blessed reference to
    # a single string, and len() calls length() on the whole buffer every
    # time it is asked.
    package LenTestA;

    # Constructor: returns a blessed reference to a new empty string.
    sub new {
        my ($class) = @_;
        my $buffer = '';
        return bless \$buffer, $class;
    }

    # Appends $data to the buffer unchanged (no encoding step).
    sub add {
        my $self = shift;
        my $data = shift;
        ${$self} .= $data;
    }

    # Returns length() of the accumulated buffer.
    sub len {
        my ($self) = @_;
        return length ${$self};
    }
}
{
    # LenTestB: accumulator that keeps a running length alongside the
    # buffer, so len() never has to call length() on the whole string.
    # The object itself is still a blessed reference to the buffer string.
    package LenTestB;
    use Scalar::Util ();

    # Running length of each live object, keyed by the object's address.
    # Kept per object so that several LenTestB instances do not share (and
    # corrupt) one counter.  Deliberately declared without an initialiser:
    # the subs below may be called before this statement is reached at run
    # time, and an assignment here would wipe counts already recorded.
    my %len_of;

    # Constructor: returns a blessed reference to a new empty string whose
    # recorded length starts at 0.
    sub new {
        my $class = shift;
        my $buffer = '';
        my $self = bless \$buffer, $class;
        $len_of{ Scalar::Util::refaddr($self) } = 0;
        return $self;
    }

    # Appends $data to the buffer and adds length($data) to this object's
    # running total.  Returns the updated total.
    sub add {
        my ($self, $data) = @_;
        $$self .= $data;
        return $len_of{ Scalar::Util::refaddr($self) } += length $data;
    }

    # Returns the running total recorded for this object (0 if nothing has
    # been added yet).
    sub len {
        my $self = shift;
        return $len_of{ Scalar::Util::refaddr($self) };
    }

    # Drops the bookkeeping entry when the object goes away, so the table
    # does not grow and a reused address never inherits a stale count.
    sub DESTROY {
        my $self = shift;
        delete $len_of{ Scalar::Util::refaddr($self) };
        return;
    }
}
{
    # LenTestC: accumulator that stores its data already UTF-8 encoded.
    # The object is a blessed reference to one byte string, so len()
    # reports encoded bytes rather than logical characters.
    package LenTestC;
    use Encode;

    # Constructor: returns a blessed reference to a new empty string.
    sub new {
        my ($class) = @_;
        my $buffer = '';
        return bless \$buffer, $class;
    }

    # Encodes $data to UTF-8 octets and appends them to the buffer.
    sub add {
        my $self = shift;
        my $data = shift;
        ${$self} .= Encode::encode_utf8($data);
    }

    # Returns length() of the encoded buffer, i.e. a byte count.
    sub len {
        my ($self) = @_;
        return length ${$self};
    }
}
outputs
with length()
1000 L= 256000 t=0.510051
2000 L= 512000 t=1.387138
3000 L= 768000 t=2.304231
4000 L= 1024000 t=3.246324
5000 L= 1280000 t=4.112412
6000 L= 1536000 t=5.093509
7000 L= 1792000 t=5.957596
8000 L= 2048000 t=6.853685
9000 L= 2304000 t=9.705970
10000 L= 2560000 t=9.114912
11000 L= 2816000 t=9.906990
12000 L= 3072000 t=11.083109
13000 L= 3328000 t=12.515251
14000 L= 3584000 t=12.456246
15000 L= 3840000 t=13.957395
with a scalar
1000 L= 256000 t=0.021152
2000 L= 512000 t=0.021664
3000 L= 768000 t=0.026949
4000 L= 1024000 t=0.025393
5000 L= 1280000 t=0.021830
6000 L= 1536000 t=0.022298
7000 L= 1792000 t=0.022668
8000 L= 2048000 t=0.021850
9000 L= 2304000 t=0.026711
10000 L= 2560000 t=0.019835
11000 L= 2816000 t=0.023417
12000 L= 3072000 t=0.020025
13000 L= 3328000 t=0.021878
14000 L= 3584000 t=0.020085
15000 L= 3840000 t=0.019838
with encode/length()
1000 L= 256000 t=0.044469
2000 L= 512000 t=0.037547
3000 L= 768000 t=0.038610
4000 L= 1024000 t=0.040161
5000 L= 1280000 t=0.039640
6000 L= 1536000 t=0.041329
7000 L= 1792000 t=0.038967
8000 L= 2048000 t=0.037193
9000 L= 2304000 t=0.040582
10000 L= 2560000 t=0.042830
11000 L= 2816000 t=0.039120
12000 L= 3072000 t=0.038353
13000 L= 3328000 t=0.047136
14000 L= 3584000 t=0.037603
15000 L= 3840000 t=0.036865
#11929 First ask yourself `How would I do this without a computer?' Then have the computer do it the same way.
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (keep them under 70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.