comment on

There is one more improvement for your algorithm, Limbic~Region. Instead of a simple insertion sort on the top list, you could do a binary insertion sort. This starts to pay off as the value N increases. I made a new benchmark with topN (by BrowserUk, with a small fix I added to put back the first short-circuit test), topNbs (based on the same code but with a binary search to find the insert point), aristotle's method, BrowserUk's method, and the original limbic method:

#!/usr/bin/perl
use strict;
use warnings;

use List::Util qw( reduce );
use Benchmark qw( cmpthese );
use Algorithm::Numerical::Shuffle qw( shuffle );
use Inline C => 'DATA';

# Dispatch table of the competing "top N" implementations, keyed by the label
# shown in the benchmark report.  Every entry is called as
#     $code{name}->( $n, \@list )
# and returns the $n largest values of @list.  The order of the returned
# values differs between implementations (some ascending, some descending),
# so callers that compare results must sort them first.
my %code = (

    # Inline C routine: insertion into a sorted buffer, linear scan for the
    # insert position.
    topN => sub {
        my ( $n, $list ) = @_;
        return topN( $n, $list );
    },

    # Inline C routine: same buffer, binary search for the insert position.
    topNbs => sub {
        my ( $n, $list ) = @_;
        return topNbs( $n, $list );
    },

    # Reference implementation: sort the whole list ascending and slice off
    # the last $n elements.
    baseline => sub {
        my ( $n, $list ) = @_;
        return ( sort { $a <=> $b } @$list )[ @$list - $n .. $#$list ];
    },

    # Pure-Perl insertion sort into a pre-sized, descending @top buffer.
    limbic => sub {
        my ( $x, $list ) = @_;
        $x--;
        my @top;
        $#top = $x;    # pre-size to $n slots, all undef
        for my $item (@$list) {

            # Buffer is full and $item cannot displace its smallest entry.
            next if defined $top[-1] && $item <= $top[-1];

            for my $id ( 0 .. $#top ) {

                # First empty slot: store and stop.  This must not be
                # written as "$top[$id] = $item and last": that form skips
                # the "last" whenever $item is 0 (a false value), letting a
                # single zero fill every remaining slot.
                if ( !defined $top[$id] ) {
                    $top[$id] = $item;
                    last;
                }

                # Insert before the first smaller entry, shifting the tail
                # right by one and dropping the final element.
                if ( $item > $top[$id] ) {
                    @top[ $id .. $#top ]
                        = ( $item, @top[ $id .. $#top - 1 ] );
                    last;
                }
            }
        }
        return @top;
    },

    # $n full passes over the list; each pass uses reduce to pick the
    # largest value that is strictly below the value found on the
    # previous pass.
    browseruk => sub {
        my ( $n, $aref ) = @_;
        my @topN;
        push @topN, reduce {
              $a > $b && ( !@topN || $a < $topN[-1] ) ? $a
            : ( !@topN || $b < $topN[-1] )            ? $b
            :                                           $a;
        } @$aref for 1 .. $n;
        return @topN;
    },

    # Seed with the first $n items; for every later item, sort it together
    # with the current top list and drop the smallest of the $n + 1 values.
    aristotle => sub {
        my ( $n, $list ) = @_;
        my @top = @$list[ 0 .. $n - 1 ];
        @top = ( sort { $a <=> $b } $_, @top )[ 1 .. $n ]
            for @$list[ $n .. $#$list ];
        return @top;
    },
);

# Benchmark cases.  Each string is "<list size> <N>"; it is split on
# whitespace into a two-element array ref [ list size, N ].
my @bench = map { [ split ' ' ] } (
    '10 5',
    '100 5',
    '1000 5',
    '10000 5',
    '100000 5',
    '100 50',
    '1000 50',
    '10000 50',
    '100000 50',
    '1000 500',
    '10000 500',
    '100000 500',
);

$|++;    # unbuffer STDOUT so each header line appears before its timings

# For every [ list size, N ] case: build a shuffled list 1 .. size, check
# that each implementation in %code returns the correct top N, then time
# all of them against each other with cmpthese.
for my $case (@bench) {
    my ( $max, $n ) = @$case;

    # CPU seconds per implementation: (log10 of the list size) squared,
    # i.e. 1, 4, 9, 16, 25 for sizes 10 .. 100000.
    my $duration = sprintf "%.2g", ( log($max) / log(10) )**2;

    print "\nLooking for top $n in $max (running for $duration CPU secs)\n";

    my @values       = 1 .. $max;
    my @values_mixed = shuffle(@values);

    # Expected answer, ascending.
    my @top = ( sort { $a <=> $b } @values )[ @values - $n .. $#values ];

    # Correctness check before timing.  Results are sorted first because
    # the implementations do not agree on output order.
    for my $name ( keys %code ) {
        my @result = sort { $a <=> $b } $code{$name}->( $n, \@values_mixed );
        die "$name not ok: [@result] ne [@top]\n" if "@result" ne "@top";
    }

    # Wrap each implementation so it is called in list context with this
    # case's arguments.
    my %timed;
    for my $name ( keys %code ) {
        my $impl = $code{$name};
        $timed{$name} = sub { my @got = $impl->( $n, \@values_mixed ) };
    }

    # A negative count asks Benchmark for at least that many CPU seconds.
    cmpthese( -$duration, \%timed );
}

__END__
__C__

/*
 * topN( n, data )
 *
 * Pushes the n largest integer values of the Perl array `data` onto the
 * Perl stack, largest first.  Fewer than n values are returned when the
 * array holds fewer than n elements; nothing is returned when n <= 0.
 *
 * Method: keep a descending buffer of the best values seen so far and
 * insert each candidate with a linear scan for its position.
 */
void topN( int n, AV*data ) {
    int *top;
    int len = av_len( data );   /* highest index; -1 for an empty array */
    int count = 0;              /* number of valid entries in top[] */
    int i, j, k;

    Inline_Stack_Vars;

    /* A non-positive n means "return nothing": skip the scan entirely so
     * top[ n - 1 ] is never evaluated with a negative index. */
    if( n <= 0 ) {
        n = 0;
        len = -1;
    }

    /* One spare slot so the shift below may write top[ n ] when full. */
    Newz( 1, top, n + 1, int );

    for( i = 0; i <= len; i++ ) {
        SV **svp = av_fetch( data, i, 0 );
        int val;

        /* av_fetch returns NULL for a slot that was never assigned. */
        if( svp == NULL ) continue;
        val = SvIV( *svp );

        /* Short-circuit only once the buffer is full.  Comparing against
         * the zero-filled buffer before that would wrongly reject every
         * value <= 0. */
        if( count == n && val <= top[ n - 1 ] ) continue;

        /* Find the first entry smaller than val (or the end of the
         * filled part). */
        for( j = 0; j < count; j++ )
            if( top[ j ] < val ) break;

        /* Shift the tail right by one and insert. */
        for( k = count; k > j; k-- ) top[ k ] = top[ k - 1 ];
        top[ j ] = val;
        if( count < n ) count++;
    }

    Inline_Stack_Reset;
    for( i = 0; i < count; i++ )
        Inline_Stack_Push( sv_2mortal( newSViv( top[ i ] ) ) );

    Safefree( top );
    Inline_Stack_Done;
}

/*
 * topNbs( n, data )
 *
 * Pushes the n largest integer values of the Perl array `data` onto the
 * Perl stack, largest first.  Fewer than n values are returned when the
 * array holds fewer than n elements; nothing is returned when n <= 0.
 *
 * Method: keep a descending buffer of the best values seen so far and
 * locate each candidate's insert position with a binary search.
 */
void topNbs( int n, AV*data ) {
    int *top;
    int len = av_len( data );   /* highest index; -1 for an empty array */
    int count = 0;              /* number of valid entries in top[] */
    int i, k;
    int left, right;

    Inline_Stack_Vars;

    /* A non-positive n means "return nothing": skip the scan entirely so
     * top[ n - 1 ] is never evaluated with a negative index. */
    if( n <= 0 ) {
        n = 0;
        len = -1;
    }

    /* One spare slot so the shift below may write top[ n ] when full. */
    Newz( 1, top, n + 1, int );

    for( i = 0; i <= len; i++ ) {
        SV **svp = av_fetch( data, i, 0 );
        int val;

        /* av_fetch returns NULL for a slot that was never assigned. */
        if( svp == NULL ) continue;
        val = SvIV( *svp );

        /* Short-circuit only once the buffer is full.  Comparing against
         * the zero-filled buffer before that would wrongly reject every
         * value <= 0. */
        if( count == n && val <= top[ n - 1 ] ) continue;

        /* Binary search over the filled part [0, count) for the first
         * entry strictly smaller than val; `left` ends on that index, or
         * on count when no such entry exists. */
        left = 0;
        right = count;
        while( left < right ) {
            int middle = left + ( ( right - left ) >> 1 );
            if( val <= top[ middle ] ) {
                left = middle + 1;
            } else {
                right = middle;
            }
        }

        /* Shift the tail right by one and insert. */
        for( k = count; k > left; k-- ) top[ k ] = top[ k - 1 ];
        top[ left ] = val;
        if( count < n ) count++;
    }

    Inline_Stack_Reset;
    for( i = 0; i < count; i++ )
        Inline_Stack_Push( sv_2mortal( newSViv( top[ i ] ) ) );

    Safefree( top );
    Inline_Stack_Done;
}
[download]

Here are the results:

Looking for top 5 in 10 (running for 1 CPU secs)
              Rate browseruk    limbic aristotle  baseline      topN  
+  topNbs
browseruk  11164/s        --      -38%      -47%      -88%      -93%  
+    -93%
limbic     17935/s       61%        --      -15%      -81%      -89%  
+    -89%
aristotle  21154/s       89%       18%        --      -77%      -87%  
+    -87%
baseline   92789/s      731%      417%      339%        --      -44%  
+    -44%
topN      167020/s     1396%      831%      690%       80%        --  
+     -0%
topNbs    167020/s     1396%      831%      690%       80%        0%  
+      --
 
Looking for top 5 in 100 (running for 4 CPU secs)
             Rate aristotle browseruk    limbic  baseline      topN   
+ topNbs
aristotle  1529/s        --      -35%      -64%      -95%      -98%   
+   -98%
browseruk  2337/s       53%        --      -45%      -92%      -97%   
+   -97%
limbic     4277/s      180%       83%        --      -85%      -94%   
+   -94%
baseline  29337/s     1819%     1155%      586%        --      -59%   
+   -61%
topN      71774/s     4595%     2971%     1578%      145%        --   
+    -4%
topNbs    74879/s     4798%     3104%     1651%      155%        4%   
+     --
 
Looking for top 5 in 1000 (running for 9 CPU secs)
             Rate aristotle browseruk    limbic  baseline      topN   
+ topNbs
aristotle   141/s        --      -45%      -86%      -94%      -99%   
+   -99%
browseruk   258/s       83%        --      -75%      -90%      -98%   
+   -98%
limbic     1034/s      632%      300%        --      -58%      -91%   
+   -91%
baseline   2462/s     1642%      852%      138%        --      -78%   
+   -80%
topN      11362/s     7941%     4296%      999%      362%        --   
+    -6%
topNbs    12050/s     8428%     4562%     1065%      390%        6%   
+     --
 
Looking for top 5 in 10000 (running for 16 CPU secs)
            Rate aristotle browseruk    limbic  baseline      topN    
+topNbs
aristotle 14.7/s        --      -44%      -90%      -92%      -99%    
+  -99%
browseruk 26.3/s       78%        --      -82%      -85%      -98%    
+  -98%
limbic     146/s      889%      454%        --      -19%      -88%    
+  -88%
baseline   179/s     1114%      581%       23%        --      -85%    
+  -86%
topN      1233/s     8267%     4591%      746%      589%        --    
+   -2%
topNbs    1252/s     8396%     4663%      759%      600%        2%    
+    --
 
Looking for top 5 in 100000 (running for 25 CPU secs)
            Rate aristotle browseruk  baseline    limbic      topN    
+topNbs
aristotle 1.45/s        --      -45%      -85%      -90%      -99%    
+  -99%
browseruk 2.65/s       83%        --      -72%      -82%      -98%    
+  -98%
baseline  9.36/s      547%      253%        --      -37%      -92%    
+  -92%
limbic    15.0/s      934%      465%       60%        --      -87%    
+  -88%
topN       114/s     7811%     4221%     1123%      665%        --    
+   -6%
topNbs     121/s     8291%     4483%     1198%      712%        6%    
+    --
 
Looking for top 50 in 100 (running for 4 CPU secs)
             Rate browseruk    limbic aristotle  baseline      topN   
+ topNbs
browseruk   222/s        --      -30%      -46%      -98%      -99%   
+   -99%
limbic      316/s       43%        --      -23%      -98%      -99%   
+   -99%
aristotle   410/s       85%       30%        --      -97%      -98%   
+   -98%
baseline  13303/s     5900%     4108%     3142%        --      -47%   
+   -51%
topN      25099/s    11221%     7839%     6017%       89%        --   
+    -7%
topNbs    26991/s    12074%     8438%     6478%      103%        8%   
+     --
 
Looking for top 50 in 1000 (running for 9 CPU secs)
            Rate aristotle browseruk    limbic  baseline      topN    
+topNbs
aristotle 22.9/s        --       -8%      -79%      -99%     -100%    
+ -100%
browseruk 24.8/s        8%        --      -78%      -99%     -100%    
+ -100%
limbic     111/s      383%      347%        --      -95%      -99%    
+  -99%
baseline  2237/s     9667%     8923%     1920%        --      -71%    
+  -73%
topN      7656/s    33333%    30785%     6816%      242%        --    
+   -7%
topNbs    8259/s    35965%    33216%     7360%      269%        8%    
+    --
 
Looking for top 50 in 10000 (running for 16 CPU secs)
            Rate aristotle browseruk    limbic  baseline      topN    
+topNbs
aristotle 2.17/s        --      -13%      -96%      -99%     -100%    
+ -100%
browseruk 2.48/s       14%        --      -95%      -99%     -100%    
+ -100%
limbic    51.6/s     2279%     1981%        --      -71%      -95%    
+  -96%
baseline   178/s     8105%     7079%      245%        --      -84%    
+  -85%
topN      1108/s    50993%    44607%     2048%      523%        --    
+   -7%
topNbs    1190/s    54771%    47912%     2207%      569%        7%    
+    --
 
Looking for top 50 in 100000 (running for 25 CPU secs)
             Rate aristotle browseruk  baseline    limbic      topN   
+ topNbs
aristotle 0.213/s        --      -15%      -98%      -98%     -100%   
+  -100%
browseruk 0.251/s       18%        --      -97%      -98%     -100%   
+  -100%
baseline   9.47/s     4343%     3667%        --      -20%      -92%   
+   -92%
limbic     11.9/s     5465%     4619%       25%        --      -89%   
+   -90%
topN        113/s    52713%    44686%     1089%      849%        --   
+    -8%
topNbs      122/s    57162%    48458%     1189%      929%        8%   
+     --
 
Looking for top 500 in 1000 (running for 9 CPU secs)
            Rate browseruk    limbic aristotle      topN  baseline    
+topNbs
browseruk 2.43/s        --      -33%      -42%     -100%     -100%    
+ -100%
limbic    3.61/s       49%        --      -13%     -100%     -100%    
+ -100%
aristotle 4.15/s       71%       15%        --      -99%     -100%    
+ -100%
topN       822/s    33807%    22679%    19725%        --      -26%    
+  -48%
baseline  1118/s    45985%    30860%    26846%       36%        --    
+  -29%
topNbs    1567/s    64513%    43307%    37678%       91%       40%    
+    --
 
Looking for top 500 in 10000 (running for 16 CPU secs)
             Rate aristotle browseruk    limbic  baseline      topN   
+ topNbs
aristotle 0.222/s        --       -9%      -84%     -100%     -100%   
+  -100%
browseruk 0.245/s       10%        --      -82%     -100%     -100%   
+  -100%
limbic     1.37/s      519%      461%        --      -99%     -100%   
+  -100%
baseline    163/s    73578%    66635%    11796%        --      -46%   
+   -71%
topN        300/s   135227%   122474%    21750%       84%        --   
+   -46%
topNbs      556/s   250785%   227143%    40408%      241%       85%   
+     --
 
Looking for top 500 in 100000 (running for 25 CPU secs)
            (warning: too few iterations for a reliable count)
            (warning: too few iterations for a reliable count)
            s/iter aristotle browseruk    limbic  baseline      topN  
+  topNbs
aristotle     47.2        --      -13%      -97%     -100%     -100%  
+   -100%
browseruk     40.9       15%        --      -97%     -100%     -100%  
+   -100%
limbic        1.20     3834%     3309%        --      -91%      -99%  
+    -99%
baseline     0.106    44269%    38352%     1028%        --      -88%  
+    -91%
topN      1.25e-02   377296%   326971%     9493%      751%        --  
+    -23%
topNbs    9.59e-03   491984%   426366%    12409%     1009%       30%  
+      --
[download]

As you can see, topNbs starts to pay off when we need the top 500 or so. For the top 5, topN is better.

Update: I have fixed the redundant lines in topNbs that BrowserUk pointed out, and re-run the benchmarks. Now topNbs does as well as or better than topN in all cases. The last thing that would be fun to try is a C-coded heap...

In reply to Re: Better mousetrap (getting top N values from list X) by tall_man
in thread Better mousetrap (getting top N values from list X) by Limbic~Region

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


XP is just a number
	PerlMonks