don't know

2014-12-10 16:01:53 +01:00
parent 993b4d3f04
commit aab9a073e9
35 changed files with 725 additions and 100 deletions
--- a/src/Cuda/Makefile
+++ b/src/Cuda/Makefile
@@ -0,0 +1,8 @@
+# Builds the CUDA vector-operations object and the host-side timing test.
+# NVCC may be overridden, e.g. `make NVCC=nvcc`; the default is the original
+# CUDA 6.5 install path.
+NVCC ?= /usr/local/cuda-6.5/bin/nvcc
+
+# `all` does not name a file, so it must always be evaluated.
+.PHONY: all
+all: test
+
+# Device code: compiled by nvcc into a host-linkable object.
+VectorOperations.o: VectorOperations.cu VectorOperations.h
+	$(NVCC) -c VectorOperations.cu -o VectorOperations.o
+
+# Host test: compiled by g++, then linked through nvcc together with the
+# CUDA object.
+test: VectorOperations.o VectorOperations.h test.cpp
+	g++  -c -std=c++14 -O0 ./test.cpp -o test.o
+	$(NVCC) ./test.o VectorOperations.o -o test
--- a/src/Cuda/VectorOperations.cpp
+++ b/src/Cuda/VectorOperations.cpp
@@ -0,0 +1,12 @@
+#include "./VectorOperations.h"
+
+// CUDA kernel: element-wise sum c[k] = a[k] + b[k] for k in [0, n).
+// Indexing uses only the x dimension; each thread handles at most one element.
+__global__ void vecAdd(double *a, double *b, double *c, int n)
+{
+    // Flat index of this thread across the whole grid
+    const int gid = threadIdx.x + blockDim.x*blockIdx.x;
+
+    // Threads past the end of the arrays have nothing to do
+    if (gid >= n)
+        return;
+
+    c[gid] = a[gid] + b[gid];
+}
--- a/src/Cuda/VectorOperations.cu
+++ b/src/Cuda/VectorOperations.cu
@@ -0,0 +1,47 @@
+#include "./VectorOperations.h"
+
+// CUDA kernel: every thread whose id is below n accumulates
+// sum over i in [0, n) of a[i]*b[i] and stores that total in ret[id].
+// Indexing uses only the x dimension. The kernel reads a[0..n-1] and
+// b[0..n-1] and writes ret[0..n-1], so each buffer must hold at least n floats.
+// NOTE(review): the name suggests element-wise addition, but the body
+// accumulates products -- confirm the intended operation.
+__global__ void __ADD(float *a, float *b, float *ret, int n)
+{
+	// Get our global thread ID
+	int id = blockIdx.x*blockDim.x+threadIdx.x;
+
+	// Make sure we do not go out of bounds
+	if (id >= n)
+		return;
+
+	float tmp=0.0f;
+	for(int i=0;i<n;i++)
+	{
+		// Index with the loop counter: element n is one past the last
+		// valid element of an n-element buffer.
+		tmp+=a[i] * b[i];
+	}
+	ret[id] = tmp;
+}
+
+// Allocates the three device buffers clientA, clientB and clientC, each
+// large enough for `size` floats.
+// The pointers start out null so the destructor never hands an
+// uninitialized member to cudaFree when an allocation is skipped or fails.
+// NOTE(review): cudaMalloc return codes are still not reported to the caller.
+Shin::Cuda::VectorOperations::VectorOperations(int size)
+	: clientA(0), clientB(0), clientC(0)
+{
+	// A negative count would convert to an enormous unsigned byte count;
+	// allocate nothing instead.
+	if (size < 0)
+		return;
+
+	cudaMalloc(&clientA, sizeof(float)*size);
+	cudaMalloc(&clientB, sizeof(float)*size);
+	cudaMalloc(&clientC, sizeof(float)*size);
+}
+
+// Releases clientA, clientB and clientC with cudaFree, in that order.
+Shin::Cuda::VectorOperations::~VectorOperations()
+{
+	float *owned[] = { clientA, clientB, clientC };
+	for (int k = 0; k < 3; ++k)
+		cudaFree(owned[k]);
+}
+
+// Copies n floats of a, b and ret into clientA, clientB and clientC, runs
+// the __ADD kernel over them and copies n floats of clientC back into ret.
+//   a, b : host input arrays, read for n floats each
+//   ret  : host array of n floats; overwritten with the kernel output
+//   n    : element count; values <= 0 make the call a no-op
+// NOTE(review): n is not checked against the size passed to the constructor
+// and CUDA return codes are ignored -- confirm callers guarantee both.
+void Shin::Cuda::VectorOperations::add(float *a, float *b, float *ret, int n)
+{
+	// Nothing to process, and a launch with zero threads would be invalid.
+	if (n <= 0)
+		return;
+
+	cudaMemcpy( clientA, a, n*sizeof(float), cudaMemcpyHostToDevice);
+	cudaMemcpy( clientB, b, n*sizeof(float), cudaMemcpyHostToDevice);
+	// Seed clientC with the caller's data so that ret comes back unchanged
+	// if the kernel does not run.
+	cudaMemcpy( clientC, ret, n*sizeof(float), cudaMemcpyHostToDevice);
+
+	// One thread per element. A single block cannot hold an arbitrary n
+	// (the per-block thread count is limited), so n is spread over as many
+	// fixed-size blocks as needed to cover it.
+	const int blockSize = 1024;
+	const int gridSize = (n - 1) / blockSize + 1;	// ceil(n / blockSize) without overflow
+	__ADD<<< gridSize, blockSize >>>(clientA, clientB, clientC, n);
+
+	// Blocking copy of the results back to the host.
+	cudaMemcpy( ret, clientC, n*sizeof(float), cudaMemcpyDeviceToHost );
+}
--- a/src/Cuda/VectorOperations.h
+++ b/src/Cuda/VectorOperations.h
@@ -0,0 +1,20 @@
+#ifndef VECT_OP_
+#define VECT_OP_
+namespace Shin
+{
+namespace Cuda
+{
+	// Wrapper around three float buffers (clientA, clientB, clientC) that
+	// VectorOperations.cu allocates with cudaMalloc in the constructor and
+	// releases with cudaFree in the destructor.
+	// NOTE(review): the class holds raw pointers and does not disable
+	// copying -- confirm instances are never copied.
+	class VectorOperations
+	{
+		public:
+			// maxSize: number of floats each buffer is allocated for.
+			VectorOperations(int maxSize);
+			~VectorOperations();
+			// Uploads n floats of a, b and ret, runs the kernel and writes
+			// n result floats back into ret.
+			// NOTE(review): presumably n must not exceed maxSize -- no check
+			// is visible; verify against callers.
+			void add(float *a, float *b, float *ret, int n);
+		protected:
+			// Device-side copies of the a, b and ret arguments of add().
+			float *clientA;
+			float *clientB;
+			float *clientC;
+	};
+}
+}
+#endif
--- a/src/Cuda/a
+++ b/src/Cuda/a
--- a/src/Cuda/answer.cu
+++ b/src/Cuda/answer.cu
@@ -0,0 +1,66 @@
+#include <stdio.h>
+
+/* Element-wise integer sum: c[k] = a[k] + b[k], one element per thread.
+   There is no bounds check, so the launch must not supply more threads
+   than the arrays have elements. */
+__global__ void vector_add(int *a, int *b, int *c)
+{
+	/* flat position of this thread across all blocks (x dimension only) */
+	const int k = threadIdx.x + blockDim.x * blockIdx.x;
+	const int total = a[k] + b[k];
+	c[k] = total;
+}
+
+/* experiment with N */
+/* how large can it be? */
+#define N (2048*2048)
+#define THREADS_PER_BLOCK 512
+
+/* Fills two N-element host arrays with a[i] = b[i] = i, adds them on the
+   GPU with vector_add and prints the first and last element of the result. */
+int main()
+{
+	int *a, *b, *c;
+	int *d_a, *d_b, *d_c;
+	int size = N * sizeof( int );
+
+	/* allocate space for device copies of a, b, c */
+
+	cudaMalloc( (void **) &d_a, size );
+	cudaMalloc( (void **) &d_b, size );
+	cudaMalloc( (void **) &d_c, size );
+
+	/* allocate space for host copies of a, b, c and setup input values */
+
+	a = (int *)malloc( size );
+	b = (int *)malloc( size );
+	c = (int *)malloc( size );
+
+	for( int i = 0; i < N; i++ )
+	{
+		a[i] = b[i] = i;
+		c[i] = 0;
+	}
+
+	/* copy inputs to device */
+	cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
+	cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );
+
+	/* launch the kernel on the GPU: enough THREADS_PER_BLOCK-sized blocks to cover N */
+	vector_add<<< (N + (THREADS_PER_BLOCK-1)) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c );
+
+	/* a kernel launch returns no status itself; a bad launch configuration
+	   only shows up through cudaGetLastError() */
+	cudaError_t launch_status = cudaGetLastError();
+	if( launch_status != cudaSuccess )
+		fprintf( stderr, "vector_add launch failed: %s\n", cudaGetErrorString( launch_status ) );
+
+	/* copy result back to host */
+	cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost );
+
+
+	/* both lines take (index, value); the format needs a conversion for each */
+	printf( "c[%d] = %d\n",0,c[0] );
+	printf( "c[%d] = %d\n",N-1, c[N-1] );
+
+	/* clean up */
+
+	free(a);
+	free(b);
+	free(c);
+	cudaFree( d_a );
+	cudaFree( d_b );
+	cudaFree( d_c );
+
+	return 0;
+} /* end main */
--- a/src/Cuda/answer.cu~
+++ b/src/Cuda/answer.cu~
@@ -0,0 +1,66 @@
+#include <stdio.h>
+
+/* Element-wise integer sum: c[k] = a[k] + b[k], one element per thread.
+   There is no bounds check, so the launch must not supply more threads
+   than the arrays have elements. */
+__global__ void vector_add(int *a, int *b, int *c)
+{
+	/* flat position of this thread across all blocks (x dimension only) */
+	const int k = threadIdx.x + blockDim.x * blockIdx.x;
+	const int total = a[k] + b[k];
+	c[k] = total;
+}
+
+/* experiment with N */
+/* how large can it be? */
+#define N (2048*2048)
+#define THREADS_PER_BLOCK 512
+
+/* Fills two N-element host arrays with a[i] = b[i] = i, adds them on the
+   GPU with vector_add and prints the first and last element of the result. */
+int main()
+{
+	int *a, *b, *c;
+	int *d_a, *d_b, *d_c;
+	int size = N * sizeof( int );
+
+	/* allocate space for device copies of a, b, c */
+
+	cudaMalloc( (void **) &d_a, size );
+	cudaMalloc( (void **) &d_b, size );
+	cudaMalloc( (void **) &d_c, size );
+
+	/* allocate space for host copies of a, b, c and setup input values */
+
+	a = (int *)malloc( size );
+	b = (int *)malloc( size );
+	c = (int *)malloc( size );
+
+	for( int i = 0; i < N; i++ )
+	{
+		a[i] = b[i] = i;
+		c[i] = 0;
+	}
+
+	/* copy inputs to device */
+	cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
+	cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );
+
+	/* launch the kernel on the GPU: enough THREADS_PER_BLOCK-sized blocks to cover N.
+	   The kernel defined in this file is vector_add; no function named add exists here. */
+	vector_add<<< (N + (THREADS_PER_BLOCK-1)) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c );
+
+	/* copy result back to host */
+	cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost );
+
+
+	/* both lines take (index, value); the format needs a conversion for each */
+	printf( "c[%d] = %d\n",0,c[0] );
+	printf( "c[%d] = %d\n",N-1, c[N-1] );
+
+	/* clean up */
+
+	free(a);
+	free(b);
+	free(c);
+	cudaFree( d_a );
+	cudaFree( d_b );
+	cudaFree( d_c );
+
+	return 0;
+} /* end main */
--- a/src/Cuda/test
+++ b/src/Cuda/test
--- a/src/Cuda/test.cpp
+++ b/src/Cuda/test.cpp
@@ -0,0 +1,44 @@
+#include "./VectorOperations.h"
+#include <chrono>
+#include <iostream>
+// Reads the first element of a and writes the same value back.
+// NOTE(review): presumably exists only to touch the buffer before the timed
+// section -- confirm the intent.
+void _hack(float *a)
+{
+	const float first = a[0];
+	a[0] = first;
+}
+// Timing harness: measures a host-side sum of a[i]*b[i], then measures
+// VectorOperations::add followed by a host-side sum of its output, and
+// prints both durations (milliseconds) with the resulting sums.
+int main()
+{
+	int size=50000000;
+	float *a= new float[size];
+	float *b= new float[size];
+	float *c= new float[size];
+	double sum = 0;
+	Shin::Cuda::VectorOperations v(size);
+	for(int i=0;i<size;i++)
+	{
+		a[i]=0.001;
+		b[i]=2;
+		c[i]=0;
+	}
+
+	_hack(a);
+	auto t1 = std::chrono::high_resolution_clock::now();
+
+	// Host reference: accumulate the products in double precision.
+	for(int i=0;i<size;i++)
+	{
+		sum+=a[i]*b[i];
+	}
+
+	auto t2 = std::chrono::high_resolution_clock::now();
+
+	std::cout << "Time 1: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count() << "result: " << sum <<std::endl;
+
+	for(int i=0;i<size;i++)
+		c[i]=0;
+
+	// The second timed region covers the add() call and the host-side
+	// reduction of its output.
+	t1 = std::chrono::high_resolution_clock::now();
+	v.add(a,b,c,size);
+	sum=0;
+	for(int i=0;i<size;i++)
+		sum+=c[i];
+	t2 = std::chrono::high_resolution_clock::now();
+	std::cout << "Time 2: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count() << "result: " << sum << std::endl;
+
+	// Release the host buffers obtained with new[] above.
+	delete[] a;
+	delete[] b;
+	delete[] c;
+}