#!/usr/bin/perl

# by bliako @ PerlMonks.org
# date: 01-Jul-2021
# see https://perlmonks.org/?node_id=11134582
# lame example for utilising GPGPU via Inline::C
# TODO: extend to taking params and returning back results

use strict;
use warnings;

use FindBin;

use Inline C => Config =>
	cc => $FindBin::Bin.'/nvcc-compile.pl',
	ld => $FindBin::Bin.'/nvcc-link.pl',
;

use Inline C => <<'EOC';
// from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
// NOTE: don't use main(void), use main()!!!
#include <stdio.h>

AV *do_saxpy(int N, SV *_x, SV *_y);
int array_numelts(SV *array);

/* CUDA kernel: y[i] = a*x[i] + y[i] for every index i below n.
 * Each thread derives one index from its block and thread coordinates and
 * updates that single element; threads whose index is n or larger do
 * nothing, so the launch may cover more threads than there are elements.
 */
__global__
void saxpy(int n, double a, double *x, double *y)
{
  const int idx = threadIdx.x + blockDim.x*blockIdx.x;

  /* surplus threads of the last block have nothing to do */
  if (idx >= n) return;

  y[idx] = a*x[idx] + y[idx];
}

/* Return the number of elements in the Perl array referenced by `array`.
 * Returns -1 when `array` is not a reference, when it does not refer to an
 * array, or when the array is empty, so callers can treat any negative
 * value as "unusable input".
 */
int array_numelts(SV *array){
	SSize_t last_index;
	if( (!SvROK(array))
	 || (SvTYPE(SvRV(array)) != SVt_PVAV)
	) return -1;
	/* av_len() reports the highest index (-1 for an empty array), not the
	   element count, hence the +1 below. */
	last_index = av_len((AV *)SvRV(array));
	if( last_index < 0 ) return -1;
	return (int)(last_index + 1);
}

/* Compute 2.0*x[i] + y[i] on the GPU for the first N elements of two Perl
 * arrays and hand the N results back as a new Perl array.
 *
 *   N   number of elements to process; must be positive and must not
 *       exceed the length of either input array
 *   _x  reference to a Perl array of numbers (only read)
 *   _y  reference to a Perl array of numbers (only read; results go into
 *       the returned array, not back into this one)
 *
 * Returns a mortalised AV holding the N results. On invalid arguments or
 * when a host/device operation fails, a message is printed to stderr and
 * NULL is returned.
 * NOTE(review): the Perl caller expects undef on failure -- confirm that
 * the AV* output typemap in use really turns a NULL return into undef.
 */
AV* do_saxpy(
	int N,
	SV *_x,
	SV *_y
)
{
  /* Everything is declared (and the pointers initialised) up front so that
     the jumps to the cleanup label never skip an initialisation. */
  double *x = NULL, *y = NULL, *d_x = NULL, *d_y = NULL;
  double maxError = 0.0, diff;
  AV *deref_x, *deref_y;
  AV *result;
  AV *ret = NULL;
  SV **elem;
  size_t nbytes;
  cudaError_t cerr;
  int i;

  if( N <= 0 ){ fprintf(stderr, "error, N must be positive.\n"); return NULL; }

  /* a negative value means "not a reference to a non-empty array" */
  if( (array_numelts(_x)<0)
    ||(array_numelts(_y)<0)
  ){ fprintf(stderr, "err\n"); return NULL; }

  deref_x = (AV *)SvRV(_x);
  deref_y = (AV *)SvRV(_y);

  /* av_len() is the highest index, so +1 gives the element count. Refuse
     to read past the end of either array. */
  if( (av_len(deref_x) + 1 < N) || (av_len(deref_y) + 1 < N) ){
    fprintf(stderr, "error, N (%d) exceeds the length of an input array.\n", N);
    return NULL;
  }

  /* Mortalised right away: on an error path nothing else refers to it and
     it is reclaimed by perl; on success the returned value keeps it alive. */
  result = newAV();
  sv_2mortal((SV*)result);

  nbytes = (size_t)N * sizeof(double);

  x = (double*)malloc(nbytes);
  y = (double*)malloc(nbytes);
  if( (x == NULL) || (y == NULL) ){
    fprintf(stderr, "error, failed to allocate host memory.\n");
    goto cleanup;
  }

  if( ((cerr=cudaMalloc(&d_x, nbytes)) != cudaSuccess)
    ||((cerr=cudaMalloc(&d_y, nbytes)) != cudaSuccess)
  ){
    fprintf(stderr, "error, cudaMalloc() has failed: %s\n", cudaGetErrorString(cerr));
    goto cleanup;
  }

  /* Copy the Perl values into plain C arrays. av_fetch() returns NULL for
     a slot that was never assigned; such a slot is read as 0. */
  for(i=0;i<N;i++){
    elem = av_fetch(deref_x, i, 0);
    x[i] = (elem != NULL) ? SvNV(*elem) : 0.0;
    elem = av_fetch(deref_y, i, 0);
    y[i] = (elem != NULL) ? SvNV(*elem) : 0.0;
    printf("do_saxpy() : got in x[%d]=%lf and y[%d]=%lf\n", i, x[i], i, y[i]);
  }

  if( ((cerr=cudaMemcpy(d_x, x, nbytes, cudaMemcpyHostToDevice)) != cudaSuccess)
    ||((cerr=cudaMemcpy(d_y, y, nbytes, cudaMemcpyHostToDevice)) != cudaSuccess)
  ){
    fprintf(stderr, "error, copying input to the device has failed: %s\n", cudaGetErrorString(cerr));
    goto cleanup;
  }

  /* Run the kernel over N elements using blocks of 256 threads;
     (N-1)/256+1 is ceil(N/256) for positive N and cannot overflow. */
  saxpy<<<(N-1)/256 + 1, 256>>>(N, 2.0, d_x, d_y);
  if( (cerr=cudaGetLastError()) != cudaSuccess ){
    fprintf(stderr, "error, kernel launch has failed: %s\n", cudaGetErrorString(cerr));
    goto cleanup;
  }

  /* Copy the results from the device buffer d_y back into the host buffer
     y, whose input values are no longer needed. */
  if( (cerr=cudaMemcpy(y, d_y, nbytes, cudaMemcpyDeviceToHost)) != cudaSuccess ){
    fprintf(stderr, "error, copying results from the device has failed: %s\n", cudaGetErrorString(cerr));
    goto cleanup;
  }

  /* Fill the returned array and, in the same pass, track the largest
     |y[i] - 4.0|. The statistic is computed by hand so that it does not
     depend on which abs()/max() overloads the compiler provides. */
  for(i=0;i<N;i++){
    av_push(result, newSVnv(y[i]));
    diff = y[i] - 4.0;
    if( diff < 0 ) diff = -diff;
    if( diff > maxError ) maxError = diff;
  }
  printf("do_saxpy() : Max error: %f\n", maxError);

  ret = result;

cleanup:
  /* single exit point: release whatever was acquired so far */
  if( d_x != NULL ) cudaFree(d_x);
  if( d_y != NULL ) cudaFree(d_y);
  free(x);
  free(y);

  return ret;
}
EOC

# Number of elements to process (the commented-out value is 2**20).
my $N = 100; #1<<20;

# Two input vectors of $N random numbers each.
my @x = map { rand() } 1..$N;
my @y = map { rand() } 1..$N;

# do_saxpy() is the Inline::C function defined above; an undefined return
# value is treated as failure, anything else as a reference to the results.
my $err = do_saxpy($N, \@x, \@y);
if( ! defined $err ){ print STDERR "$0 : error, call to do_saxpy() has failed.\n"; exit(1); }
# print rather than printf: the message interpolates $0, and a '%' in the
# script path would otherwise be interpreted as a format conversion.
print "$0 : back to perl-code ...\n";
print "$0 : (perl-code) : got back result :\n".join("\n", @$err)."\n";