Beefy Boxes and Bandwidth Generously Provided by pair Networks
Welcome to the Monastery
 
PerlMonks  

comment on

( #3333=superdoc: print w/replies, xml ) Need Help??

Hi vr and all,

This is a parallel version for UNIX and Windows. The lengths piddle is shared using PDL::IO::FastRaw.

Update 1: Construct seqs_c from inside workers to consume lesser memory consumption.
Update 2: Applied vr's bug fix. Plus added 2nd example that runs parallel on Windows.

Update 3: Output not 100% consistent (1e7 diff). Not suited for parallelism due to cache miss. The non-PDL solutions here and here handle cache miss. But not yet here.

9,12c9,12 < [ 616 7532665] # correct < [ 613 5649499] < [ 611 8474249] < [ 608 6355687] --- > [ 615 7532665] # wrong > [ 612 5649499] > [ 610 8474249] > [ 607 6355687] 14c14 < [ 606 9533531] # correct --- > [ 605 9533531] # wrong

UNIX:

use strict; use warnings; use feature 'say'; BEGIN { # Does not work on Windows unfortunately. die "Sorry, this script requires a UNIX based OS, exiting...\n" if $^O eq 'MSWin32'; } use PDL; use File::Map; # ensure that Perl has File::Map before loading FastRaw use PDL::IO::FastRaw; use MCE::Signal '$tmp_dir'; use MCE::Flow; { no warnings 'once'; $PDL::BIGPDL = 1; } use List::Util; BEGIN { *_min = \&List::Util::min; # collision *_max = \&List::Util::max } # with PDL use constant MAX => shift || 1e7; use constant TOP => _min( 20, MAX ); use constant CHUNK => _min( 40000, MAX ); # but keep it even use constant MAXLEN => MAX * 1; # ?? # x(1..2) use Time::HiRes 'time'; # create a raw file for lengths writefraw( ones( short, 3 + MAXLEN ), "$tmp_dir/lengths" ); # memory map the raw file my $lengths = mapfraw( "$tmp_dir/lengths" ); $lengths-> inplace-> setvaltobad( 1 ); $lengths-> set( 1, 1 ); $lengths-> set( 2, 2 ); $lengths-> set( 4, 3 ); my $t = time; mce_flow_s { max_workers => MCE::Util::get_ncpu(), chunk_size => CHUNK + 1, bounds_only => 1, }, sub { my ( $mce, $chunk_ref, $chunk_id ) = @_; my ( $from, $to ) = @{ $chunk_ref }; my $seqs_c = $from + sequence( longlong, $to - $from + 1 ); if ( $chunk_id == 1 ) { $seqs_c-> setbadat( 0 ); $seqs_c-> setbadat( 1 ); $seqs_c-> badvalue( 2 ); } else { $seqs_c-> setbadat( $from % 2 ? 1 : 0 ); $seqs_c-> slice([ $from % 2 ? 1 : 0, $to - $from, 2 ]) .= 2 +; $seqs_c-> badvalue( 2 ); } my $lengths_c = $lengths-> slice([ $from, $to ]); my $current = zeroes( short, nelem( $seqs_c )); while ( any $seqs_c-> isgood ) { my ( $seqs_c_odd, $current_odd_masked ) = where( $seqs_c, $current, $seqs_c & 1 ); $current_odd_masked ++; $current ++; ( $seqs_c_odd *= 3 ) ++; $seqs_c >>= 1; my ( $seqs_cap, $lengths_cap, $current_cap ) = where( $seqs_c, $lengths_c, $current, $seqs_c <= MAXLEN ); my $lut = $lengths-> index( $seqs_cap ); # "_f" is for "finished" my ( $seqs_f, $lengths_f, $lut_f, $current_f ) = where( $seqs_cap, $lengths_cap, $lut, $current_cap, $lut-> isgood ); $lengths_f .= $lut_f + $current_f; $seqs_f .= 2; # i.e. BAD } # "_e" is for "at even positions, ahead" ## ## # my $from_e = _max( $from * 2, $to ) + 2; # bug ## my $from_e = $from == 0 ? $to + 2 : $from * 2; # fixed ## my $to_e = _min( $to * 2, MAXLEN ); ## ## ( $lengths-> slice([ $from_e, $to_e, 2 ]) ## .= $lengths-> slice([ $from_e / 2, $to_e / 2 ])) ++ ## if $from_e <= MAXLEN; ## }, 0, MAX; MCE::Flow->finish; say {*STDERR} "compute time: ", time - $t; # same finale $lengths-> badflag( 0 ); $lengths = $lengths-> slice([ 1, MAX ]); my $sorted_i = $lengths-> qsorti; my $sorted = $lengths-> index( $sorted_i ); my $value = $sorted-> at( MAX - TOP ); my $pos = vsearch_insert_leftmost( $value, $sorted ); my $top_i = $sorted_i-> slice([ MAX - 1, $pos ]); ( my $result = $lengths -> index( $top_i ) -> longlong -> bitnot -> cat( $top_i + 1 ) -> transpose -> qsortvec -> slice([], [ 0, TOP - 1 ]) )-> slice([ 0 ], []) -> inplace -> bitnot; say $result; say {*STDERR} "total time: ", time - $t;

Windows:

The following script works on Windows with up to 8 workers. Specifying higher than 8 workers causes PDL to emit, "PDL::Internal Error: data structure recursion limit exceeded (max 1000 levels)". I also tested on Linux. No problems there including 32 workers.

use strict; use warnings; use feature 'say'; use PDL; use File::Map; # ensure that Perl has File::Map before loading FastRaw use PDL::IO::FastRaw; use MCE::Signal '$tmp_dir'; use MCE::Flow; { no warnings 'once'; $PDL::BIGPDL = 1; } use List::Util; BEGIN { *_min = \&List::Util::min; # collision *_max = \&List::Util::max } # with PDL use constant MAX => shift || 1e7; use constant TOP => _min( 20, MAX ); use constant CHUNK => _min( 40000, MAX ); # but keep it even use constant MAXLEN => MAX * 1; # ?? # x(1..2) use Time::HiRes 'time'; # create a raw file for lengths writefraw( ones( short, 3 + MAXLEN ), "$tmp_dir/lengths" ); my $t = time; my $lengths; MCE::Flow->init( max_workers => _min( 8, MCE::Util::get_ncpu() ), chunk_size => CHUNK + 1, bounds_only => 1, init_relay => 1, user_begin => sub { $lengths = mapfraw( "$tmp_dir/lengths" ); if ( MCE->wid == 1 ) { $lengths-> inplace-> setvaltobad( 1 ); $lengths-> set( 1, 1 ); $lengths-> set( 2, 2 ); $lengths-> set( 4, 3 ); } MCE->sync; }, ); mce_flow_s sub { my ( $mce, $chunk_ref, $chunk_id ) = @_; my ( $from, $to ) = @{ $chunk_ref }; my $seqs_c = $from + sequence( longlong, $to - $from + 1 ); if ( $chunk_id == 1 ) { $seqs_c-> setbadat( 0 ); $seqs_c-> setbadat( 1 ); $seqs_c-> badvalue( 2 ); } else { $seqs_c-> setbadat( $from % 2 ? 1 : 0 ); $seqs_c-> slice([ $from % 2 ? 1 : 0, $to - $from, 2 ]) .= 2 +; $seqs_c-> badvalue( 2 ); } my $lengths_c = $lengths-> slice([ $from, $to ]); my $current = zeroes( short, nelem( $seqs_c )); while ( any $seqs_c-> isgood ) { my ( $seqs_c_odd, $current_odd_masked ) = where( $seqs_c, $current, $seqs_c & 1 ); $current_odd_masked ++; $current ++; ( $seqs_c_odd *= 3 ) ++; $seqs_c >>= 1; my ( $seqs_cap, $lengths_cap, $current_cap ) = where( $seqs_c, $lengths_c, $current, $seqs_c <= MAXLEN ); my $lut = $lengths-> index( $seqs_cap ); # "_f" is for "finished" my ( $seqs_f, $lengths_f, $lut_f, $current_f ) = where( $seqs_cap, $lengths_cap, $lut, $current_cap, $lut-> isgood ); $lengths_f .= $lut_f + $current_f; $seqs_f .= 2; # i.e. BAD } # "_e" is for "at even positions, ahead" ## ## # my $from_e = _max( $from * 2, $to ) + 2; # bug ## my $from_e = $from == 0 ? $to + 2 : $from * 2; # fixed ## my $to_e = _min( $to * 2, MAXLEN ); ## ## MCE::relay { ## ( $lengths-> slice([ $from_e, $to_e, 2 ]) ## .= $lengths-> slice([ $from_e / 2, $to_e / 2 ])) ++ ## if $from_e <= MAXLEN; ## }; }, 0, MAX; MCE::Flow->finish; say {*STDERR} "compute time: ", time - $t; # same finale $lengths = mapfraw( "$tmp_dir/lengths" ); $lengths-> badflag( 0 ); $lengths = $lengths-> slice([ 1, MAX ]); my $sorted_i = $lengths-> qsorti; my $sorted = $lengths-> index( $sorted_i ); my $value = $sorted-> at( MAX - TOP ); my $pos = vsearch_insert_leftmost( $value, $sorted ); my $top_i = $sorted_i-> slice([ MAX - 1, $pos ]); ( my $result = $lengths -> index( $top_i ) -> longlong -> bitnot -> cat( $top_i + 1 ) -> transpose -> qsortvec -> slice([], [ 0, TOP - 1 ]) )-> slice([ 0 ], []) -> inplace -> bitnot; say $result; say {*STDERR} "total time: ", time - $t;

Results:

Compute time excludes initial piddle creation and final sorting/output as these run serially. Total time is captured by running time perl script.pl.

1e7: serial total 15.272s, compute 14.674s 1 core parallel total 8.327s, compute 7.478s 2 cores parallel total 4.726s, compute 3.912s 4 cores parallel total 2.855s, compute 2.025s 8 cores parallel total 1.911s, compute 1.085s 16 cores parallel total 1.484s, compute 0.835s 32 cores [ [ 686 8400511] [ 668 8865705] [ 665 6649279] [ 663 9973919] [ 621 6674175] [ 616 7332399] [ 616 7532665] [ 613 5649499] [ 611 8474249] [ 608 6355687] [ 606 8847225] [ 606 9533531] [ 603 6635419] [ 601 9953129] [ 598 7464846] [ 598 7464847] [ 597 3732423] [ 595 5598635] [ 593 8397953] [ 590 6298465] ] 1e8: parallel total 11.779s, compute 5.631s 32 cores

Regards, Mario


In reply to Re^4: Optimizing with Caching vs. Parallelizing (MCE::Map) (PDL: faster) by marioroy
in thread Optimizing with Caching vs. Parallelizing (MCE::Map) by 1nickt

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others drinking their drinks and smoking their pipes about the Monastery: (3)
    As of 2021-01-26 05:21 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?
      Notices?