http://qs321.pair.com?node_id=11115363


in reply to Re^2: Optimizing with Caching vs. Parallelizing (MCE::Map) (Better caching?)
in thread Optimizing with Caching vs. Parallelizing (MCE::Map)

Hi,

There was one thing left to try and that is using Inline::C. I captured the top 20 for 1e9 and 1e10.

Update 1: Set chunk_size to reduce IPC. Tried staying within CPU cache by keeping @ret small. 1e10 now completes in 4 minutes (20% improvement).

Update 2: Added collatz_count_c2 using GCC compiler intrinsics.

use strict; use warnings; use feature 'say'; use MCE::Flow; use Inline C => Config => CCFLAGSEX => '-O2 -fomit-frame-pointer', clean_after_build => 0; use Inline C => <<'END_OF_C_CODE'; #include <stdlib.h> #include <stdint.h> #if defined(_WIN32) #define strtoull _strtoui64 #endif void collatz_count_c1( SV* _n ) { uint64_t n, steps = 1; /* include 1 */ Inline_Stack_Vars; #ifdef __LP64__ n = SvUV( _n ); #else n = strtoull( SvPV_nolen( _n ), NULL, 10 ); #endif /* count using the T(x) notation */ do { n % 2 ? ( steps += 2, n = (3 * n + 1) >> 1 ) : ( steps += 1, n = n >> 1 ); } while ( n != 1 ); Inline_Stack_Reset; Inline_Stack_Push( sv_2mortal( newSVuv( steps ) ) ); Inline_Stack_Done; } void collatz_count_c2( SV* _n ) { uint64_t n, steps = 1; /* include 1 */ Inline_Stack_Vars; #ifdef __LP64__ n = SvUV( _n ); #else n = strtoull( SvPV_nolen( _n ), NULL, 10 ); #endif /* based on GCC compiler intrinsics demonstration by Alex Shirley +*/ /* https://stackoverflow.com/questions/38114205/c-collatz-conjectu +re-optimization */ /* https://www.go4expert.com/articles/builtin-gcc-functions-builti +nclz-t29238 */ if ( n % 2 == 0 ) { steps += __builtin_ctz(n); /* account for all evens */ n >>= __builtin_ctz(n); /* always returns an odd */ } /* when we enter we're always working on an odd number */ do { n = 3 * n + 1; steps += __builtin_ctz(n) + 1; /* account for odd and even */ n >>= __builtin_ctz(n); /* always returns an odd */ } while ( n != 1 ); Inline_Stack_Reset; Inline_Stack_Push( sv_2mortal( newSVuv( steps ) ) ); Inline_Stack_Done; } END_OF_C_CODE sub collatz_count { my ($n) = @_; my $steps = 1; # count 1 # count using the T(x) notation $n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 ) : ( $steps += 1, $n = $n >> 1 ) while $n != 1; return $steps; } no warnings 'once'; #*collatz = \&collatz_count; # choose collatz here #*collatz = \&collatz_count_c1; # using the T(x) notation *collatz = \&collatz_count_c2; # using compiler intrinsics my $m = shift || 1e6; my ( @sizes, $chunk_size ); $chunk_size = int( $m / MCE::Util::get_ncpu() / 40 + 1 ); $chunk_size += 1 if $chunk_size % 2; MCE::Flow->init( max_workers => MCE::Util::get_ncpu(), chunk_size => $chunk_size, gather => \@sizes, bounds_only => 1, ); mce_flow_s sub { my ( $start, $end ) = @{ $_[1] }; my @ret; for ( $start .. $end ) { push @ret, [ $_, collatz($_) ]; @ret = ( sort { $b->[1] <=> $a->[1] } @ret )[ 0..19 ] if @ret > 100; } ( @ret > 20 ) ? MCE->gather( ( sort { $b->[1] <=> $a->[1] } @ret )[ 0..19 ] +) : MCE->gather( @ret ); }, 1, $m; MCE::Flow->finish; say "@$_" for ( sort { $b->[1] <=> $a->[1] } @sizes )[ 0..19 ];

Output

1..1e9 collatz_count 6m10.877s 32 cores Perl collatz_count_c1 11m20.134s 1 core Inline C collatz_count_c2 8m56.683s 1 core Inline C collatz_count_c1 0m23.443s 32 cores 29.0x collatz_count_c2 0m18.507s 32 cores 29.0x 670617279 987 848749995 977 954843745 972 954843751 972 716132809 969 537099606 966 537099607 966 268549803 965 805649409 964 805649410 964 805649411 964 805649415 964 402824705 963 604237057 961 604237058 961 604237059 961 302118529 960 906355586 959 906355587 959 906355588 959 1..1e10, 32 cores, Inline C collatz_count_c1 4m00.323s collatz_count_c2 3m09.774s 9780657630 1133 9780657631 1133 4890328815 1132 7335493223 1130 6189322407 1122 9283983611 1120 8140184737 1094 6105138553 1091 9157707830 1089 9157707831 1089 4578853915 1088 6868280873 1086 5151210655 1083 7726815983 1081 9282648843 1058 6961986633 1055 5221489974 1052 5221489975 1052 2610744987 1051 7832234962 1050 1..1e11, 32 cores, Inline C collatz_count_c1 40m50.177s collatz_count_c2 32m17.319s 75128138247 1229 84519155529 1224 85833820801 1224 63389366646 1221 63389366647 1221 64375365601 1221 31694683323 1220 95084049969 1219 95084049970 1219 95084049971 1219 96563048402 1219 96563048403 1219 47542024985 1218 48281524201 1218 71313037476 1216 71313037477 1216 71313037478 1216 71313037479 1216 72422286302 1216 72422286303 1216

Look at C go!

Regards, Mario