don't know
8  src/Cuda/Makefile  Normal file
@@ -0,0 +1,8 @@
all: test

VectorOperations.o: VectorOperations.cu VectorOperations.h
	/usr/local/cuda-6.5/bin/nvcc -c VectorOperations.cu -o VectorOperations.o

test: VectorOperations.o VectorOperations.h test.cpp
	g++ -c -std=c++14 -O0 ./test.cpp -o test.o
	/usr/local/cuda-6.5/bin/nvcc ./test.o VectorOperations.o -o test
12  src/Cuda/VectorOperations.cpp  Normal file
@@ -0,0 +1,12 @@
#include "./VectorOperations.h"

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x + threadIdx.x;

    // Make sure we do not go out of bounds
    if (id < n)
        c[id] = a[id] + b[id];
}
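For context, a __global__ kernel like vecAdd above is launched from host code with a grid sized to cover all n elements; a minimal sketch, assuming d_a, d_b and d_c are device buffers already filled with cudaMemcpy (these names are placeholders, not part of the commit):

// Sketch only: cover n elements with 256-thread blocks.
int blockSize = 256;
int gridSize  = (n + blockSize - 1) / blockSize;      // round up
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);    // extra threads exit via the id < n guard
cudaDeviceSynchronize();                              // wait for the kernel to finish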
47  src/Cuda/VectorOperations.cu  Normal file
@@ -0,0 +1,47 @@
#include "./VectorOperations.h"

// CUDA kernel: every thread loops over the full arrays and accumulates
// a[i]*b[i], so each element of ret ends up holding the same dot product.
__global__ void __ADD(float *a, float *b, float *ret, int n)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x + threadIdx.x;

    // Make sure we do not go out of bounds
    if (id >= n)
        return;

    float tmp = 0.0;
    for (int i = 0; i < n; i++)
    {
        tmp += a[i] * b[i];
    }
    ret[id] = tmp;
}

Shin::Cuda::VectorOperations::VectorOperations(int size)
{
    cudaMalloc(&clientA, sizeof(float)*size);
    cudaMalloc(&clientB, sizeof(float)*size);
    cudaMalloc(&clientC, sizeof(float)*size);
}

Shin::Cuda::VectorOperations::~VectorOperations()
{
    cudaFree(clientA);
    cudaFree(clientB);
    cudaFree(clientC);
}

void Shin::Cuda::VectorOperations::add(float *a, float *b, float *ret, int n)
{
    //cudaMemcpyAsync(clientA, a, n*sizeof(float), cudaMemcpyHostToDevice, );
    //cudaMemcpyAsync(clientB, a, n*sizeof(float), cudaMemcpyHostToDevice, );
    cudaMemcpy(clientA, a, n*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(clientB, b, n*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(clientC, ret, n*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize, gridSize;
    blockSize = 1024;
    gridSize = (int)ceil((float)n/blockSize);
    // A single block of n threads is invalid for n > 1024, so launch the computed grid.
    __ADD<<<gridSize, blockSize>>>(clientA, clientB, clientC, n);
    cudaMemcpy(ret, clientC, n*sizeof(float), cudaMemcpyDeviceToHost);
}
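None of the CUDA calls in add() are checked, so a failed copy or launch passes silently. A minimal sketch of how the launch could be verified; the error-handling style is a suggestion, not part of the commit, and it assumes <cstdio> for fprintf:

// Sketch only: check the launch, then wait and check execution.
__ADD<<<gridSize, blockSize>>>(clientA, clientB, clientC, n);
cudaError_t err = cudaGetLastError();                // launch-configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();                   // errors raised while the kernel ran
if (err != cudaSuccess)
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));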
20  src/Cuda/VectorOperations.h  Normal file
@@ -0,0 +1,20 @@
#ifndef VECT_OP_
#define VECT_OP_
namespace Shin
{
namespace Cuda
{
class VectorOperations
{
public:
    VectorOperations(int maxSize);
    ~VectorOperations();
    void add(float *a, float *b, float *ret, int n);
protected:
    float *clientA;
    float *clientB;
    float *clientC;
};
}
}
#endif
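The class allocates three device buffers of maxSize floats in the constructor and reuses them on every add() call, so callers need to keep n at or below maxSize. A minimal usage sketch (sizes chosen arbitrarily):

// Sketch only: construct once, reuse the device buffers across calls.
Shin::Cuda::VectorOperations ops(1024);   // room for 1024 floats per buffer
float a[1024], b[1024], c[1024];
// ... fill a and b ...
ops.add(a, b, c, 1024);                   // copy in, run the kernel, copy c back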
BIN  src/Cuda/a  Executable file
Binary file not shown.
66  src/Cuda/answer.cu  Normal file
@@ -0,0 +1,66 @@
#include <stdio.h>

__global__ void vector_add(int *a, int *b, int *c)
{
    /* insert code to calculate the index properly using blockIdx.x, blockDim.x, threadIdx.x */
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    c[index] = a[index] + b[index];
}

/* experiment with N */
/* how large can it be? */
#define N (2048*2048)
#define THREADS_PER_BLOCK 512

int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int size = N * sizeof( int );

    /* allocate space for device copies of a, b, c */
    cudaMalloc( (void **) &d_a, size );
    cudaMalloc( (void **) &d_b, size );
    cudaMalloc( (void **) &d_c, size );

    /* allocate space for host copies of a, b, c and setup input values */
    a = (int *)malloc( size );
    b = (int *)malloc( size );
    c = (int *)malloc( size );

    for( int i = 0; i < N; i++ )
    {
        a[i] = b[i] = i;
        c[i] = 0;
    }

    /* copy inputs to device */
    /* fix the parameters needed to copy data to the device */
    cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice );
    cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice );

    /* launch the kernel on the GPU */
    /* insert the launch parameters to launch the kernel properly using blocks and threads */
    vector_add<<< (N + (THREADS_PER_BLOCK-1)) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c );

    /* copy result back to host */
    /* fix the parameters needed to copy data back to the host */
    cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost );

    printf( "c[0] = %d\n", c[0] );
    printf( "c[%d] = %d\n", N-1, c[N-1] );

    /* clean up */
    free(a);
    free(b);
    free(c);
    cudaFree( d_a );
    cudaFree( d_b );
    cudaFree( d_c );

    return 0;
} /* end main */
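The launch configuration in answer.cu works out exactly: N = 2048*2048 = 4,194,304, and (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK = 8192 blocks of 512 threads, i.e. exactly N threads, so every index the kernel computes stays in range. If N were not a multiple of the block size, the kernel would also need an if (index < N) guard. A small sketch of the same check on the host, reusing the names from answer.cu:

/* Sketch only: the rounded-up block count covers N exactly. */
int blocks = (N + (THREADS_PER_BLOCK - 1)) / THREADS_PER_BLOCK;   /* 8192 */
printf( "blocks = %d, threads launched = %d, N = %d\n",
        blocks, blocks * THREADS_PER_BLOCK, N );                  /* 8192, 4194304, 4194304 */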
66  src/Cuda/answer.cu~  Normal file
(Editor backup of answer.cu: its 66 lines are identical to answer.cu above, except that the kernel launch line still reads add<<< (N + (THREADS_PER_BLOCK-1)) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c ) instead of vector_add<<< ... >>>.)
BIN  src/Cuda/test  Executable file
Binary file not shown.
44  src/Cuda/test.cpp  Normal file
@@ -0,0 +1,44 @@
#include "./VectorOperations.h"
#include <chrono>
#include <iostream>
void _hack(float *a)
{
    a[0] = a[0];
}
int main()
{
    int size = 50000000;
    float *a = new float[size];
    float *b = new float[size];
    float *c = new float[size];
    double sum = 0;
    Shin::Cuda::VectorOperations v(size);
    for (int i = 0; i < size; i++)
    {
        a[i] = 0.001;
        b[i] = 2;
        c[i] = 0;
    }

    _hack(a);
    auto t1 = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < size; i++)
    {
        sum += a[i]*b[i];
    }

    auto t2 = std::chrono::high_resolution_clock::now();

    std::cout << "Time 1: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count() << " ms, result: " << sum << std::endl;

    for (int i = 0; i < size; i++)
        c[i] = 0;
    t1 = std::chrono::high_resolution_clock::now();
    v.add(a, b, c, size);
    sum = 0;
    for (int i = 0; i < size; i++)
        sum += c[i];
    t2 = std::chrono::high_resolution_clock::now();
    std::cout << "Time 2: " << std::chrono::duration_cast<std::chrono::milliseconds>(t2-t1).count() << " ms, result: " << sum << std::endl;
}
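The second measurement wraps the whole v.add() call, so it times the two host-to-device copies, the kernel, and the copy back as one block. If only the kernel itself were of interest, CUDA events are the usual tool; a minimal sketch of how add() could time its own launch (not part of the commit):

// Sketch only: time just the kernel with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
__ADD<<<gridSize, blockSize>>>(clientA, clientB, clientC, n);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);   // elapsed kernel time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);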