#include "./VectorOperations.h" // CUDA kernel. Each thread takes care of one element of c __global__ void __ADD(float *a, float *b, float *ret, int n) { // Get our global thread ID int id = blockIdx.x*blockDim.x+threadIdx.x; if (id >= n) return; // Make sure we do not go out of bounds float tmp=0.0; for(int i=0;i>>(clientA, clientB, clientC, n); cudaMemcpy( ret, clientC, n*sizeof(float), cudaMemcpyDeviceToHost ); }