I was sufficiently frustrated by the discussion in the note above, that I decided to just go ahead and correct the algorithm for situations where there are same sized buckets. The code posted below solves Grandfather's "Nasty distribution" perfectly. It is still O(N) where N is the number of items to allocate.
The algorithm below is a hybrid of the one proposed by Limbic-Region and the one I originally proposed. Smallest buckets are filled first (as per Limbic-Region), but when we encounter N equal sized buckets we allocate items N (one per bucket) at a time until we can't. Then we go back to allocating items to buckets one by one. This prevents the first of N buckets from hogging all the "good" values.
I'd still very much appreciate it if someone could find an example or two that doesn't work with this algorithm. Though I'm not convinced that this problem is anything like NP complete, I'm sure the code I've posted has room for improvement.
use strict;
use warnings;
sub demoAllocation($$$);
demoAllocation
("Distribution: all at mean"
, {a=>1000,b=>2000,c=>3000}
, { '6.0' => 6000 }
);
demoAllocation
("Distribution: unskewed"
, {a=>1000,b=>2000,c=>3000}
, { '3.0' => 300, '4.0' => 600, '5.0' => 700, '5.5' => 900
, '6.0' => 1000
, '6.5' => 900, '7.0' => 700, '8.0' => 600, '9.0' => 300 }
);
demoAllocation
("Distribution: skewed"
, {a=>1000,b=>2000,c=>3000}
, { '3.0' => 4000, '12.0' => 2000 }
);
demoAllocation
("Distribution: skewed: Nasty (Relatively prime)"
, {a=>3, b=>3}
, {'1.0' => 4, '2.0' => 1, '4.0' => 1 }
);
demoAllocation
("Distribution: skewed: Nasty (Grandfather)"
, {a=>30, b=>30}
, {'1.0' => 40, '2.0' => 10, '4.0' => 10 }
);
demoAllocation
("Distribution: Original poster"
, {A=>65, B=>12, C=>24, D=>19, E=>30}
, {'93.8' => 5, '93.81' => 20, '93.82' => 10
, '93.83' => 15, '93.84' => 25, '93.85' => 5
, '93.87'=>20, '94.0' => 5, '94.1' => 35
, '94.2'=> 10 }
);
demoAllocation
("Distribution: camelback: Nasty mark II (Grandfather)/2 "
, { a => 15, b => 10, c => 5 }
, {'1.0' => 15, '2.0' => 6, '4.0' => 3, '8.0' => 6 }
);
demoAllocation
("Distribution: camelback: Nasty mark II (Grandfather)"
, { a => 30, b => 20, c => 10 }
, {'1.0' => 30, '2.0' => 12, '4.0' => 6, '8.0' => 12 }
);
demoAllocation
("Distribution: flat: Limbic~Region"
, { a => 3, b => 4, c => 2, d => 2 }
, {'1.0' => 1, '2.0' => 1, '3.0' => 1, '4.0' => 1
, '5.0' => 1, '6.0' => 1, '7.0' => 1, '8.0' => 1
, '9.0' => 1, '10.0' => 1, '11.0' => 1
}
);
#------------------------------------------------------------
sub demoAllocation($$$) {
my ($sDescription, $hBuckets, $hFrequency) = @_;
print "$sDescription\n";
my ($dAvg, $hAllocation) = allocate($hBuckets, $hFrequency);
foreach my $sId (sort keys %$hAllocation) {
my $hItems = $hAllocation->{$sId};
my $dSum = 0;
my $iCount = 0;
my ($dBucketAvg, $dDeviation);
my $iBucketSize = $hBuckets->{$sId};
print "$sId:";
foreach my $dValue (sort keys %$hItems) {
my $iFreq = $hItems->{$dValue};
printf "\t%s \@ \$%.2f\n", $iFreq, $dValue;
$dSum += $dValue*$iFreq;
$iCount += $iFreq;
}
$dBucketAvg = $dSum/$iCount;
$dDeviation = $dBucketAvg - $dAvg;
if ($iBucketSize != $iCount) {
printf "\t**ERROR**: bucket size: %d, actual allocation: %d\n"
, $iBucketSize, $iCount;
}
printf "\tcount(e/a): %d/%d, bucket avg: \$%.2f, "
."deviation: \$%.3f\n\n"
, $iBucketSize, $iCount, $dBucketAvg, $dDeviation;
}
print "\n";
}
#------------------------------------------------------------
sub allocate($$) {
my ($hBuckets, $hFrequency) = @_;
#calculate deviations from the mean
my $dAvg=calcWeightedAvg($hFrequency);
my ($iFreqAvg, $aAbove, $aBelow)
= calcDeviations($hFrequency, $dAvg);
#sort buckets by size: smallest first
my $hBucketsBySize = groupBucketsBySize($hBuckets);
my @aBuckets = map { [ $_, $hBucketsBySize->{$_} ]
} sort keys %$hBucketsBySize;
#my @aBuckets = sort { $hBuckets->{$a} <=> $hBuckets->{$b}
# } keys %$hBuckets;
#allocate items to buckets, smallest first
my %hAllocations;
my $iFirstAbove = 0;
my $iFirstBelow = 0;
foreach my $aSameSizeBuckets (@aBuckets) {
my $iSize = $aSameSizeBuckets->[0];
my $aIds = $aSameSizeBuckets->[1];
#my $iSize = $hBuckets->{$sId};
fillBucket($iSize, \%hAllocations, $aIds
, $dAvg, \$iFreqAvg
, $aAbove, \$iFirstAbove
, $aBelow, \$iFirstBelow);
}
return ($dAvg, \%hAllocations);
}
#------------------------------------------------------------
# SUPPORTING FUNCTIONS - alphabetical order
#------------------------------------------------------------
sub calcDeviations($$) {
my ($hFrequency, $dAvg) = @_;
my @aAbove;
my @aBelow;
my $iFreqAvg = 0;
#calculate deviations from mean
while (my ($dValue,$iFreq) = each(%$hFrequency)) {
if ($dValue == $dAvg) {
$iFreqAvg+=$iFreq;
next;
}
my $dDeviation = $dValue - $dAvg;
if (0 < $dDeviation) {
push @aAbove, [ $dDeviation, $dValue, $iFreq ];
} else {
push @aBelow, [ -$dDeviation, $dValue, $iFreq ];
}
}
#sort with smallest deviations first
return ( $iFreqAvg
, [ sort { compareDeviations($a,$b) } @aAbove ]
, [ sort { compareDeviations($a,$b) } @aBelow ]
);
}
#------------------------------------------------------------
sub compareDeviations($$) {
my ($x, $y) = @_;
return $x->[0] <=> $y->[0];
}
#------------------------------------------------------------
sub calcWeightedAvg($) {
my $hFrequency = shift @_;
my $dSum=0;
my $iCount=0;
while (my ($dValue,$iFreq) = each(%$hFrequency)) {
$dSum+=$dValue*$iFreq;
$iCount+=$iFreq;
}
return $dSum/$iCount;
}
#------------------------------------------------------------
sub fillBucket($$$$$$$) {
my ($iSize, $hAllocations, $aIds
, $dAvg, $rFreqAvg
, $aAbove, $rFirstAbove
, $aBelow, $rFirstBelow) = @_;
my $iFilled = 0;
#Phase I - allocate to all buckets simultaneously
#fillBucketOneByOne($iSize, $hAllocations, $aIds, \$iFilled
# , $dAvg, $rFreqAvg
# , $aAbove, $rFirstAbove
# , $aBelow, $rFirstBelow);
#$iSize -= $iFilled;
#Phase II - allocate to each bucket individually
foreach my $sId (@$aIds) {
next unless $iSize;
$iFilled = 0;
fillBucketOneByOne($iSize
, $hAllocations, [$sId], \$iFilled
, $dAvg, $rFreqAvg
, $aAbove, $rFirstAbove
, $aBelow, $rFirstBelow);
}
}
#------------------------------------------------------------
sub fillBucketOneByOne($$$$$$$$$) {
my ($iNeeded, $hAllocations, $aIds, $rFilled
, $dAvg, $rFreqAvg
, $aAbove, $rFirstAbove
, $aBelow, $rFirstBelow) = @_;
my $iBucketCount = scalar(@$aIds);
#take items that are at the mean, if we can
if ($iNeeded*$iBucketCount <= $$rFreqAvg) {
foreach (@$aIds) {
$hAllocations->{$_}->{$dAvg} = $iNeeded;
$$rFreqAvg-=$iNeeded;
}
$$rFilled = $iNeeded;
return;
} elsif ($iNeeded % $iBucketCount) {
return;
}
while ($iBucketCount <= $$rFreqAvg) {
foreach (@$aIds) {
$hAllocations->{$_}->{$dAvg}++;
$$rFreqAvg--;
}
$$rFilled++;
$iNeeded--;
}
my $aUp = $aAbove->[$$rFirstAbove];
my $aDown = $aBelow->[$$rFirstBelow];
my $dNetDeviation = 0;
#take whatever creates the smallest net deviation
# [0] deviation
# [1] value
# [2] frequency
while ($iNeeded > 0) {
my $bUseUp = 0;
if ($aUp) {
if ($aDown) {
my $dNetUp = $dNetDeviation + $aUp->[0];
my $dNetDown = $dNetDeviation - $aDown->[0];
if (abs($dNetUp) < abs($dNetDown)) {
$bUseUp = 1;
$dNetDeviation = $dNetUp;
} else {
$bUseUp = 0;
$dNetDeviation = $dNetDown;
}
} else {
$bUseUp = 1;
}
} elsif ($aDown) {
$bUseUp = 0;
} else {
return;
}
if ($bUseUp) {
#$hItems->{$aUp->[1]} ++;
return if ($aUp->[2] % $iBucketCount);
foreach (@$aIds) {
$hAllocations->{$_}->{$aUp->[1]} ++;
$aUp->[2]--;
}
$$rFirstAbove++ unless $aUp->[2];
$aUp = $aAbove->[$$rFirstAbove];
} else {
#$hItems->{$aDown->[1]} ++;
return if ($aDown->[2] % $iBucketCount);
foreach (@$aIds) {
$hAllocations->{$_}->{$aDown->[1]} ++;
$aDown->[2]--;
}
$$rFirstBelow++ unless $aDown->[2];
$aDown = $aBelow->[$$rFirstBelow];
}
$$rFilled++;
$iNeeded--;
}
#return $hItems;
}
#------------------------------------------------------------
sub groupBucketsBySize($) {
my $hBuckets = shift @_;
my $hBucketsBySize = {};
while (my ($sId, $iSize) = each (%$hBuckets)) {
my $aIds = $hBucketsBySize->{$iSize};
$aIds = $hBucketsBySize->{$iSize} = [] unless $aIds;
push @$aIds, $sId;
}
return $hBucketsBySize;
}