Holy crap!
So I decided to try it after all. I tested the C implementation (see below) on the same dataset that took PDL 80m44s, and it finished in 1m42s! Simply compiling with -O3 reduced that further to 26 seconds, or about 186 times faster than PDL.
That pretty much answers my initial question. :) The only reason I was reluctant to try this from the beginning is that I know almost nothing about C, so even these simple things take me a while.
I'm including the C program in case anyone is interested, but it's really embarrassingly trivial.
#include <limits.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
/* Mean of the element-wise products of z1 and z2 over `size` elements. */
float pearson(int size, float *z1, float *z2);
/* Print `message` to stderr and terminate the process with exit status 1. */
void die(char *message);
/* malloc() wrapper that calls die() instead of returning NULL. */
void *cmalloc(int size);
/*
 * Parse `text` as a decimal integer in the range [1, max].
 * Calls die() on empty input, trailing garbage, or an out-of-range value.
 */
static int parse_dimension(char *text, long max) {
    char *end = NULL;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0' || value < 1 || value > max)
        die("ROWS and COLS must be positive integers small enough to allocate.");
    return (int)value;
}

/*
 * Usage: <program> ROWS COLS < data
 *
 * Reads ROWS * COLS floats in the machine's native binary representation
 * from stdin (one row of COLS values after another), then calls pearson()
 * on every unordered pair of distinct rows, reporting progress on stderr.
 * Returns 0 on success; exits with status 1 (via die() or the usage
 * message) on bad arguments, allocation failure, or short input.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "usage: %s ROWS COLS < float-data\n",
                argc > 0 ? argv[0] : "pearson");
        return 1;
    }

    /* cmalloc() takes an int, so cap each dimension so that the byte count
     * of the corresponding allocation still fits in an int. */
    int rows = parse_dimension(argv[1], (long)(INT_MAX / sizeof(float *)));
    int cols = parse_dimension(argv[2], (long)(INT_MAX / sizeof(float)));
    int i, j;

    /* The number of pairs grows quadratically with rows and exceeds INT_MAX
     * well before rows does, so count in long long. */
    long long count = 0;
    long long total = (long long)rows * (rows - 1) / 2;

    /* Storing each result in a volatile object keeps an optimizing compiler
     * from discarding the pearson() calls as dead code, which would make
     * timing measurements meaningless. */
    volatile float sink;

    fprintf(stderr, "allocating %d by %d array\n", rows, cols);
    float **data = (float **)cmalloc((int)(rows * sizeof(float *)));
    for (i = 0; i < rows; i++)
        data[i] = (float *)cmalloc((int)(cols * sizeof(float)));

    fprintf(stderr, "reading stdin\n");
    for (i = 0; i < rows; i++) {
        if (fread(data[i], sizeof(float), (size_t)cols, stdin) != (size_t)cols)
            die("Error reading data from file.");
    }

    fprintf(stderr, "computing correlations\n");
    for (i = 0; i < rows; i++) {
        for (j = i + 1; j < rows; j++) {
            sink = pearson(cols, data[i], data[j]);
            if (count % 500000 == 0)
                fprintf(stderr, "\r%lld / %lld", count, total);
            count++;
        }
    }
    fprintf(stderr, "\n");

    for (i = 0; i < rows; i++)
        free(data[i]);
    free(data);
    return 0;
}
// ---------------------------------------------------------
/*
 * Returns the mean of the element-wise products z1[i] * z2[i] for
 * i in [0, size).
 *
 * NOTE(review): the name and the z1/z2 parameters suggest the inputs are
 * already standardized (z-scores), in which case this mean is the Pearson
 * correlation coefficient -- confirm against how the input data is prepared.
 *
 * `size` is not validated; it is used directly as the divisor, so callers
 * are expected to pass size > 0.
 */
float pearson(int size, float *z1, float *z2){
    double sum = 0.0;
    int i;

    for (i = 0; i < size; i++) {
        /* Widen to double BEFORE multiplying. The product of two floats is
         * exactly representable in a double, whereas a float * float product
         * is rounded to float precision before it reaches the accumulator. */
        sum += (double)z1[i] * (double)z2[i];
    }
    return (float)(sum / size);
}
/*
 * Fatal-error helper: writes "oopsies: <message>" followed by a newline
 * to stderr, then terminates the process with exit status 1.
 * Never returns to the caller.
 */
void die(char *message)
{
    FILE *const err = stderr;

    (void)fprintf(err, "oopsies: %s\n", message);
    exit(1);
}
/*
 * Checked malloc(): returns a pointer to at least `size` bytes, or calls
 * die() (which does not return) if the request cannot be satisfied.
 *
 * The caller owns the returned memory and releases it with free().
 */
void *cmalloc(int size) {
    void *ptr;

    /* A negative int would silently convert to an enormous size_t. */
    if (size < 0)
        die("cmalloc called with a negative size");

    /* malloc(0) is allowed to return NULL without that being a failure;
     * request one byte instead so NULL always means "out of memory". */
    ptr = malloc(size > 0 ? (size_t)size : 1);
    if (ptr == NULL)
        die("malloc failed");
    return ptr;
}