comment on

Hi vr and all,

This is a parallel version for UNIX and Windows. The lengths piddle is shared using PDL::IO::FastRaw.

Update 1: Construct seqs_c from inside workers to consume lesser memory consumption.
Update 2: Applied vr's bug fix. Plus added 2nd example that runs parallel on Windows.
Update 3: Output not 100% consistent (1e7 diff). Not suited for parallelism due to cache miss. The non-PDL solutions here and here handle cache miss. But not yet here.

9,12c9,12
<  [    616 7532665]  # correct
<  [    613 5649499]
<  [    611 8474249]
<  [    608 6355687]
---
>  [    615 7532665]  # wrong
>  [    612 5649499]
>  [    610 8474249]
>  [    607 6355687]
14c14
<  [    606 9533531]  # correct
---
>  [    605 9533531]  # wrong
[download]

Update 4: Output now 100% consistent, possible with MCE::relay.

UNIX:

use strict;
use warnings;
use feature 'say';

BEGIN {
    # Does not work on Windows unfortunately.
    die "Sorry, this script requires a UNIX based OS, exiting...\n"
        if $^O eq 'MSWin32';
}

use PDL;
use File::Map; # ensure that Perl has File::Map before loading FastRaw
use PDL::IO::FastRaw;
use MCE::Signal '$tmp_dir';
use MCE::Flow;

{
    no warnings 'once'; $PDL::BIGPDL = 1;
    eval q{ PDL::set_autopthread_targ(1) };
}

use List::Util;
BEGIN { *_min = \&List::Util::min;          # collision
        *_max = \&List::Util::max }         # with PDL

use constant MAX    => shift || 1e7;
use constant TOP    => _min( 20, MAX );
use constant CHUNK  => _min( 40000, MAX );  # but keep it even
use constant MAXLEN => MAX * 1;             # ?? # x(1..2)

use Time::HiRes 'time';

# create a raw file for lengths
writefraw( ones( short, 3 + MAXLEN ), "$tmp_dir/lengths" );

# memory map the raw file
my $lengths = mapfraw( "$tmp_dir/lengths" );
$lengths-> inplace-> setvaltobad( 1 );
$lengths-> set( 1, 1 );
$lengths-> set( 2, 2 );
$lengths-> set( 4, 3 );

my $t = time;

mce_flow_s {
    max_workers => MCE::Util::get_ncpu(),
    chunk_size  => CHUNK + 1,
    bounds_only => 1,
    init_relay  => 1,
},
sub {
    my ( $mce, $chunk_ref, $chunk_id ) = @_;
    my ( $from, $to ) = @{ $chunk_ref };

    my $seqs_c = $from + sequence( longlong, $to - $from + 1 );

    if ( $chunk_id == 1 ) {
        $seqs_c-> setbadat( 0 );
        $seqs_c-> setbadat( 1 );
        $seqs_c-> badvalue( 2 );
    }
    else {
        $seqs_c-> setbadat(  $from % 2 ? 1 : 0 );
        $seqs_c->    slice([ $from % 2 ? 1 : 0, $to - $from, 2 ]) .= 2
+;
        $seqs_c-> badvalue( 2 );
    }

    my $lengths_c = $lengths-> slice([ $from, $to ]);
    my $current   = zeroes( short, nelem( $seqs_c ));

    while ( any $seqs_c-> isgood ) {

        my ( $seqs_c_odd, $current_odd_masked )
            = where( $seqs_c, $current, $seqs_c & 1 );

        $current_odd_masked ++;
        $current ++;
        ( $seqs_c_odd *= 3 ) ++;
        $seqs_c >>= 1;

        my ( $seqs_cap, $lengths_cap, $current_cap )
            = where( $seqs_c, $lengths_c, $current,
                $seqs_c <= MAXLEN );

        my $lut = $lengths-> index( $seqs_cap );

        # "_f" is for "finished"

        my ( $seqs_f, $lengths_f, $lut_f, $current_f )
            = where( $seqs_cap, $lengths_cap, $lut, $current_cap,
                $lut-> isgood );

        $lengths_f .= $lut_f + $current_f;
        $seqs_f    .= 2;                    # i.e. BAD
    }

    # "_e" is for "at even positions, ahead"                    ##
                                                                ##
#   my $from_e = _max( $from * 2, $to ) + 2;        # bug       ##
    my $from_e = $from == 0 ? $to + 2 : $from * 2;  # fixed     ##
    my $to_e   = _min( $to * 2, MAXLEN );                       ##
                                                                ##
    MCE::relay {                                                ##
      ( $lengths-> slice([ $from_e, $to_e, 2 ])                 ##
          .= $lengths-> slice([ $from_e / 2, $to_e / 2 ])) ++   ##
              if $from_e <= MAXLEN;                             ##
    };                                                          ##

}, 0, MAX;

MCE::Flow->finish;
say {*STDERR} "compute time: ", time - $t;

# same finale

$lengths-> badflag( 0 );
$lengths = $lengths-> slice([ 1, MAX ]);

my $sorted_i = $lengths-> qsorti;
my $sorted   = $lengths-> index( $sorted_i );
my $value    = $sorted-> at( MAX - TOP );
my $pos      = vsearch_insert_leftmost( $value, $sorted );
my $top_i    = $sorted_i-> slice([ MAX - 1, $pos ]);

( my $result = $lengths
    -> index( $top_i )
    -> longlong
    -> bitnot
    -> cat( $top_i + 1 )
    -> transpose
    -> qsortvec
    -> slice([], [ 0, TOP - 1 ])

)-> slice([ 0 ], [])
 -> inplace
 -> bitnot;

say $result;
say {*STDERR} "total time: ", time - $t;
[download]

Windows:

The following script works on Windows with up to 8 workers. Specifying higher than 8 workers causes PDL to emit, "PDL::Internal Error: data structure recursion limit exceeded (max 1000 levels)". I also tested on Linux. No problems there including 32 workers.

use strict;
use warnings;
use feature 'say';

use PDL;
use File::Map; # ensure that Perl has File::Map before loading FastRaw
use PDL::IO::FastRaw;
use MCE::Signal '$tmp_dir';
use MCE::Flow;

{
    no warnings 'once'; $PDL::BIGPDL = 1;
    eval q{ PDL::set_autopthread_targ(1) };
}

use List::Util;
BEGIN { *_min = \&List::Util::min;          # collision
        *_max = \&List::Util::max }         # with PDL

use constant MAX    => shift || 1e7;
use constant TOP    => _min( 20, MAX );
use constant CHUNK  => _min( 40000, MAX );  # but keep it even
use constant MAXLEN => MAX * 1;             # ?? # x(1..2)

use Time::HiRes 'time';

# create a raw file for lengths
writefraw( ones( short, 3 + MAXLEN ), "$tmp_dir/lengths" );

my $max_workers = $^O eq 'MSWin32' ? 8 : MCE::Util::get_ncpu();
my $t = time;
my $lengths;

MCE::Flow->init(
    max_workers => _min( $max_workers, MCE::Util::get_ncpu() ),
    chunk_size  => CHUNK + 1,
    bounds_only => 1,
    init_relay  => 1,

    user_begin  => sub {
        $lengths = mapfraw( "$tmp_dir/lengths" );
        if ( MCE->wid == 1 ) {
            $lengths-> inplace-> setvaltobad( 1 );
            $lengths-> set( 1, 1 );
            $lengths-> set( 2, 2 );
            $lengths-> set( 4, 3 );
        }
        MCE->sync;
    },
);

mce_flow_s sub {
    my ( $mce, $chunk_ref, $chunk_id ) = @_;
    my ( $from, $to ) = @{ $chunk_ref };

    my $seqs_c = $from + sequence( longlong, $to - $from + 1 );

    if ( $chunk_id == 1 ) {
        $seqs_c-> setbadat( 0 );
        $seqs_c-> setbadat( 1 );
        $seqs_c-> badvalue( 2 );
    }
    else {
        $seqs_c-> setbadat(  $from % 2 ? 1 : 0 );
        $seqs_c->    slice([ $from % 2 ? 1 : 0, $to - $from, 2 ]) .= 2
+;
        $seqs_c-> badvalue( 2 );
    }

    my $lengths_c = $lengths-> slice([ $from, $to ]);
    my $current   = zeroes( short, nelem( $seqs_c ));

    while ( any $seqs_c-> isgood ) {

        my ( $seqs_c_odd, $current_odd_masked )
            = where( $seqs_c, $current, $seqs_c & 1 );

        $current_odd_masked ++;
        $current ++;
        ( $seqs_c_odd *= 3 ) ++;
        $seqs_c >>= 1;

        my ( $seqs_cap, $lengths_cap, $current_cap )
            = where( $seqs_c, $lengths_c, $current,
                $seqs_c <= MAXLEN );

        my $lut = $lengths-> index( $seqs_cap );

        # "_f" is for "finished"

        my ( $seqs_f, $lengths_f, $lut_f, $current_f )
            = where( $seqs_cap, $lengths_cap, $lut, $current_cap,
                $lut-> isgood );

        $lengths_f .= $lut_f + $current_f;
        $seqs_f    .= 2;                    # i.e. BAD
    }

    # "_e" is for "at even positions, ahead"                    ##
                                                                ##
#   my $from_e = _max( $from * 2, $to ) + 2;        # bug       ##
    my $from_e = $from == 0 ? $to + 2 : $from * 2;  # fixed     ##
    my $to_e   = _min( $to * 2, MAXLEN );                       ##
                                                                ##
    MCE::relay {                                                ##
      ( $lengths-> slice([ $from_e, $to_e, 2 ])                 ##
          .= $lengths-> slice([ $from_e / 2, $to_e / 2 ])) ++   ##
              if $from_e <= MAXLEN;                             ##
    };                                                          ##

}, 0, MAX;

MCE::Flow->finish;
say {*STDERR} "compute time: ", time - $t;

# same finale

$lengths = mapfraw( "$tmp_dir/lengths" );

$lengths-> badflag( 0 );
$lengths = $lengths-> slice([ 1, MAX ]);

my $sorted_i = $lengths-> qsorti;
my $sorted   = $lengths-> index( $sorted_i );
my $value    = $sorted-> at( MAX - TOP );
my $pos      = vsearch_insert_leftmost( $value, $sorted );
my $top_i    = $sorted_i-> slice([ MAX - 1, $pos ]);

( my $result = $lengths
    -> index( $top_i )
    -> longlong
    -> bitnot
    -> cat( $top_i + 1 )
    -> transpose
    -> qsortvec
    -> slice([], [ 0, TOP - 1 ])

)-> slice([ 0 ], [])
 -> inplace
 -> bitnot;

say $result;
say {*STDERR} "total time: ", time - $t;
[download]

Results:

Compute time excludes initial piddle creation and final sorting/output as these run serially. Total time is captured by running time perl script.pl.

1e7:
    serial  total 15.272s, compute 14.674s   1 core
  parallel  total  8.327s, compute  7.478s   2 cores
  parallel  total  4.726s, compute  3.912s   4 cores
  parallel  total  2.855s, compute  2.025s   8 cores
  parallel  total  1.911s, compute  1.085s  16 cores
  parallel  total  1.484s, compute  0.835s  32 cores

[
 [    686 8400511]
 [    668 8865705]
 [    665 6649279]
 [    663 9973919]
 [    621 6674175]
 [    616 7332399]
 [    616 7532665]
 [    613 5649499]
 [    611 8474249]
 [    608 6355687]
 [    606 8847225]
 [    606 9533531]
 [    603 6635419]
 [    601 9953129]
 [    598 7464846]
 [    598 7464847]
 [    597 3732423]
 [    595 5598635]
 [    593 8397953]
 [    590 6298465]
]

1e8:
  parallel  total 11.779s, compute  5.631s  32 cores
[download]

Regards, Mario

In reply to Re^4: Optimizing with Caching vs. Parallelizing (MCE::Map) (PDL: faster) by marioroy
in thread Optimizing with Caching vs. Parallelizing (MCE::Map) by 1nickt

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


Don't ask to ask, just ask
	PerlMonks