Re^2: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer)

I'm a really sore^Hthirsty loser. So I wrote an XS function that did the replacement in-place by operating directly on the SV'S char vector. It's faster than your swap function but still slower than my s///-pos stuff, I don't really know why.

use Inline C => q[
void avar_cee(SV *sv1, SV *sv2)
{
    char *sv1p, *sv2p;
    STRLEN sv1len, sv2len;
    SV *sv1_sv;
    STRLEN i;

    if (!SvROK(sv1) || !SvPOK((SV*)SvRV(sv1)) || !SvPOK(sv2)) {
        croak("Usage: avar_cee(\$s1, $s2)");
    }

    sv1_sv = (SV*)SvRV(sv1);
    sv1p = SvPV(sv1_sv, sv1len);
    sv2p = SvPV(sv2,    sv2len);

    if (sv1len != sv2len) {
       croak("The given strings must be of the same length");
    }

    for (i = 0; i < sv2len; i++) {
        if (sv1p[i] == '\0') {
            sv1p[i] = sv2p[i];
        }
    }
}
[download]

It could be sped up with something like OpenMP by putting #pragma omp paralell for before the for loop but I didn't have gcc 4.2 to test it. I changed the test script so that it wouldn't be unfair to the in-place functions. Previously I was copying the data anyway so that testing would work which defeated the purpose of doing it in-place. Results:

split1            5.07/s     --    -82%      -87% -99%  -99%   -99%   
+    -99%     -100%  -100% -100%          -100%             -100%
substr1           27.8/s   449%      --      -31% -96%  -96%   -97%   
+    -97%      -98%   -98%  -99%           -99%              -99%
ikegami_s         40.3/s   695%     45%        -- -94%  -95%   -95%   
+    -96%      -97%   -98%  -98%           -99%              -99%
avar               700/s 13700%   2414%     1636%   --  -10%   -18%   
+    -30%      -42%   -62%  -70%           -80%              -86%
avar2              778/s 15243%   2695%     1830%  11%    --    -8%   
+    -22%      -35%   -58%  -67%           -78%              -85%
corion             848/s 16635%   2948%     2005%  21%    9%     --   
+    -15%      -29%   -54%  -63%           -76%              -83%
ikegami_tr         999/s 19609%   3490%     2379%  43%   28%    18%   
+      --      -17%   -45%  -57%           -72%              -80%
avar2_pos         1201/s 23591%   4215%     2880%  72%   54%    42%   
+     20%        --   -34%  -48%           -66%              -76%
moritz            1832/s 36032%   6481%     4445% 162%  135%   116%   
+     83%       53%     --  -21%           -49%              -64%
swap              2324/s 45742%   8250%     5666% 232%  199%   174%   
+    133%       93%    27%    --           -35%              -54%
avar_c_inplace    3567/s 70267%  12717%     8751% 410%  359%   320%   
+    257%      197%    95%   53%             --              -30%
avar2_pos_inplace 5063/s 99781%  18093%    12464% 624%  551%   497%   
+    407%      322%   176%  118%            42%                --
[download]

benchmark.pl:

#!/usr/bin/perl

use 5.6.0;

use strict;
use warnings FATAL => 'all';

use Benchmark qw( cmpthese );

use Inline C => q[
void avar_c_inplace(SV *sv1, SV *sv2)
{
    char *sv1p, *sv2p;
    STRLEN sv1len, sv2len;
    SV *sv1_sv;
    STRLEN i;

    if (!SvROK(sv1) || !SvPOK((SV*)SvRV(sv1)) || !SvPOK(sv2)) {
        croak("Usage: avar_cee(\$s1, $s2)");
    }

    sv1_sv = (SV*)SvRV(sv1);
    sv1p = SvPV(sv1_sv, sv1len);
    sv2p = SvPV(sv2,    sv2len);

    if (sv1len != sv2len) {
       croak("The given strings must be of the same length");
    }

    for (i = 0; i < sv2len; i++) {
        if (sv1p[i] == '\0') {
            sv1p[i] = sv2p[i];
        }
    }
}

SV* swap(SV* one, SV* two) {
    char *buf1, *buf2, *buf3;
    STRLEN idx, len;
    SV* ret;

    if (!SvPOK(one) || !SvPOK(two) || sv_len(one) > sv_len(two))
        return newSVsv(&PL_sv_undef);

    len = sv_len(one);
    buf1 = SvPVX(one);
    buf2 = SvPVX(two);
    buf3 = malloc(len);

    for (idx=0; idx < len; idx++) {
        buf3[idx] = buf1[idx] ? buf1[idx] : buf2[idx];
    }

    ret = newSVpv(buf3, len);
    free(buf3);
    return ret;
}
];

my $s1 = do_rand(0, 100_000);
my $s2 = do_rand(1, 100_000);
#my $s1 = "a\0b\0c\0d\0";
#my $s2 = "aXbYcZdA";

my @z = $s1 =~ m/(\0)/g;
my $num = @z;
warn "\$s1 has $num zeroes";

my $cp0 = $s1;
my $cp1 = $s1;

my %subs = (
    'split1'     => sub { my $s3 = split1( $s1, $s2 ) },
    'substr1'    => sub { my $s3 = substr1( $s1, $s2 ) },
#    'kyle'       => sub { my $s3 = kyle( $s1, $s2 ) },
    'moritz'     => sub { my $s3 = moritz( $s1, $s2 ) },
    'corion'     => sub { my $s3 = corion( $s1, $s2 ) },
    'ikegami_s'  => sub { my $s3 = ikegami_s( $s1, $s2 ) },
    'ikegami_tr' => sub { my $s3 = ikegami_tr( $s1, $s2 ) },
    'avar'       => sub { my $s3 = avar( $s1, $s2 ) },
    'avar2'      => sub { my $s3 = avar2( $s1, $s2 ) },
    'avar2_pos'  => sub { my $s3 = avar2_pos( $s1, $s2 ) },
    'avar2_pos_inplace'  => sub { avar2_pos_inplace( \$cp0, $s2 ); $cp
+0 },
    'avar_c_inplace'  => sub { avar_c_inplace( \$cp1, $s2 ); $cp1 },
    'swap'  => sub { swap( $s1, $s2 ) },
#    'bogus'     => sub { "oh noes" },
);

cmpthese( -2, \%subs );

use Test::More;

plan 'tests' => scalar keys %subs;

my $correct;
{
    my $tmp = $s1;
    avar2_pos_inplace(\$tmp, $s2);
    $correct = $tmp;
}

foreach my $subname ( keys %subs ) {
    my $sub = $subs{$subname};
    ok($sub->() eq $correct, "$subname returned the correct value");
}

sub split1 {
    my ($s1, $s2) = @_;

    my @s1 = split //, $s1;
    my @s2 = split //, $s2;

    foreach my $idx ( 0 .. $#s1 ) { 
    if ( $s1[$idx] eq chr(0) ) { 
        $s1[$idx] = $s2[$idx];
    } 
    } 

    return join '', @s1;
}

sub substr1 {
    my ($s1, $s2) = @_;

    for my $idx ( 0 .. length($s1) ) {
    if ( substr($s1,$idx,1) eq chr(0) ) {
        substr($s1, $idx, 1) = substr($s2, $idx, 1);
    }
    } 

    return $s1;
} 

sub kyle {
    my ($s1, $s2) = @_;

    my $out = $s1;
    while ( $s1 =~ m/\000/g ) {
    my $pos = pos;
    substr( $out, $pos, 1 ) = substr( $s2, $pos, 1 );
    }

    return $out;
}

sub moritz {
    my ($s1, $s2) = @_;

    my $pos = 0;
    while ( 0 < ( $pos = index $s1, "\000", $pos ) ) {
    substr( $s1, $pos, 1 ) = substr( $s2, $pos, 1 );
    }

    return $s1;
}

sub ikegami_s {
    my ($s1, $s2) = @_;
    (my $mask = $s1) =~ s/[^\x00]/\xFF/g;
    return ($s1 & $mask) | ($s2 & ~$mask);
}

sub ikegami_tr {
   my ($s1, $s2) = @_;
   (my $mask = $s1) =~ tr/\x00/\xFF/c;
   return ($s1 & $mask) | ($s2 & ~$mask);
}

sub corion {
   my ($s1, $s2) = @_;

   my $ofs = 0;
   return join "", map { $ofs += length; $_ => substr $s2, $ofs++, 1 }
+ split /\0/, $s1, -1;
}

sub avar {
    my ($s1, $s2) = @_;

    my $s3 = $s1;

    {
        use bytes;
        $s3 =~ s/(\0)/substr $s2, $+[0]-1, 1/eg;
    }

    $s3;
}


sub avar2 {
    my ($s1, $s2) = @_;

    use bytes;
    $s1 =~ s/(\0)/substr $s2, $+[0]-1, 1/eg;

    return $s1;
}

sub avar2_pos {
    my ($s1, $s2) = @_;

    use bytes;
    $s1 =~ s/\0/substr $s2, pos($s1), 1/eg;

    return $s1;
}

sub avar2_pos_inplace {
    my ($s1, $s2) = @_;

    use bytes;
    $$s1 =~ s/\0/substr $s2, pos($$s1), 1/eg;
}

sub avar2_pos_inplace2 {
    my ($s1, $s2) = @_;

    use bytes;
    $$s1 =~ s/\0/substr $$s2, pos($$s1), 1/eg;
}

# This makes sure that $s1 has chr(0)'s in it and $s2 does not.
sub do_rand {
    my $min = shift;
    my $len = shift;
    my $n = "";
    for (1 .. $len)
    {
        $n .= chr( rand(255-$min)+$min );
    }
    return $n;
}

#sub do_rand {
#    my $n = (shift) ? int(rand(255)) : int(rand(254)) + 1;
#    return chr( $n );
#}

__END__
[download]

Comment on Re^2: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer) Select or Download Code

Replies are listed 'Best First'.
Re^3: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer) (addr math) by tye (Sage) on Sep 12, 2007 at 17:39 UTC
You are calculating array offsets over and over. The fast way to do this in C is more like: `int len; char* pBeg= SvPV(...,len); char* pSrc= pBeg + len; char* pDst= SvPV(....) + len; while( pBeg <= pSrc ) { if( ! pDst ) { pDst= *pSrc; } --pSrc; --pDst; }` [download] - tye	[reply] [d/l]
Re^4: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer) (addr math) by diotalevi (Canon) on Sep 13, 2007 at 01:24 UTC
I'd already coded up an XS version and used memchr for my search instead of your explicit loop. I don't know why but I found that using memchr got a function that was twice as fast. I included the searching part of the code below. Rate avar avar2 ikegami_tr avar2_pos corion morit +z avar2_pos_inplace dio_c2 dio_c avar 109/s -- -6% -18% -40% -51% -70 +% -84% -91% -95% avar2 116/s 6% -- -13% -36% -48% -68 +% -83% -90% -95% ikegami_tr 133/s 22% 15% -- -27% -41% -64 +% -81% -89% -94% avar2_pos 182/s 66% 56% 36% -- -19% -50 +% -74% -84% -91% corion 224/s 105% 93% 68% 23% -- -39 +% -67% -81% -89% moritz 366/s 235% 215% 175% 101% 63% - +- -47% -69% -83% avar2_pos_inplace 686/s 527% 490% 415% 278% 206% 87 +% -- -41% -68% dio_c2 1172/s 971% 908% 779% 545% 422% 220 +% 71% -- -45% dio_c 2118/s 1836% 1722% 1488% 1066% 844% 479 +% 209% 81% -- dio_c2 // Do it while ( dpv < dpv_end ) { if ( ! dpv ) dpv = spv; ++spv; ++dpv; } dio_c // Do it while ( 1 ) { ptr = (char)memchr( ptr, '\0', ptr_end - ptr ); if ( ! ( ptr && ptr < ptr_end ) ) break; ptr = (ptr - dpv + spv); ++ ptr; } [download] ⠤⠤ ⠙⠊⠕⠞⠁⠇⠑⠧⠊	[reply] [d/l]
Re^5: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer) (addr math) by tye (Sage) on Sep 13, 2007 at 01:29 UTC
Yes, memchr() and related items usually end up implementing their loop in a single machine-language instruction on most processors. Given the relative infrequency of '\0' bytes, the added complexity of computation when one is found is outweighed by the much more efficient finding of them. - tye	[reply]
Re^3: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer) by diotalevi (Canon) on Sep 13, 2007 at 06:16 UTC
I just tested this snippet on gcc 4.2.1 and it had the same speed on a two-processor, multi-core machine as it did without the OpenMP annotations as well as when it was compiled by gcc 3.whatever. `#pragma omp parallel for for( ; dsvc < dsvc_end; ++dsvc, ++ssvc ) { if ( ! dsvc ) dsvc = ssvc; }` [download] Just to see if things changed if I simplified the for(;;) loop, this also had no effect. Or perhaps the openmp version got slower. It's been about a 1000 calls slower the last few times I tried it. I'm sure that's just because the loop has to do the pointer math all the time instead of just incrementing like the previous version. OpenMP just isn't getting enabled, I think. `/ Do it. / #pragma omp parallel for for( o = 0; o < len; ++o ) { if ( ! ( dsvc + o ) ) (dsvc+o) = (ssvc+o); }` [download] In the below chart I deleted a pile of uninteresting programs and kept only the ones I was testing and things that performed at least as well as my worst candidate. `Rate moritz_inplace dio_openmp swap dio_noopenmp + avar_tye_c_inplace avar_c_inplace dio_c moritz_inplace 1948/s -- -64% -66% -68% + -69% -73% -83% dio_openmp 5344/s 174% -- -7% -13% + -15% -26% -54% swap 5745/s 195% 8% -- -6% + -9% -20% -50% dio_noopenmp 6124/s 214% 15% 7% -- + -3% -15% -47% avar_tye_c_inplace 6286/s 223% 18% 9% 3% + -- -12% -45% avar_c_inplace 7178/s 269% 34% 25% 17% + 14% -- -38% dio_c 11495/s 490% 115% 100% 88% + 83% 60% --` [download] ⠤⠤ ⠙⠊⠕⠞⠁⠇⠑⠧⠊	[reply] [d/l] [select]


Your skill will accomplish what the force of many cannot
	PerlMonks