#include <stdio.h>
#include <stdlib.h>
// Launch configuration and problem size shared by main() and functionG.
// A: number of thread blocks in the grid (first <<< >>> argument in main).
#define A 5
// B: number of threads per block (second <<< >>> argument in main).
#define B 2
// N: number of float elements in every host/device buffer; also the bound
// checked inside functionG. A * B equals N, so each element gets one thread.
#define N 10
// Kernel: for every global thread index idx < N, writes
// output[idx] = input[idx] + 0.001 * idx.
__global__ void functionG(float *input, float *output);
// Prints the pending CUDA runtime error (if any), tagged with msg.
void cudaErr(const char *msg);
/*
 * Host driver for the functionG demo.
 *
 * Allocates N-element float buffers on the host and the device, fills the
 * host input with 0..N-1, copies it to the device, launches functionG with
 * A blocks of B threads, copies the result back and prints one line per
 * element ("index, input output").
 *
 * Returns 0 on normal completion, 1 if a host allocation fails. CUDA errors
 * are reported through cudaErr() and do not abort the run.
 */
int main(){
    printf("hello CUDA\n");

    float *x_h = NULL, *y_h = NULL;   /* host input / output   */
    float *x_d = NULL, *y_d = NULL;   /* device input / output */
    const size_t memSize = sizeof(float) * N;

    x_h = (float *)malloc(memSize);
    y_h = (float *)malloc(memSize);
    if (x_h == NULL || y_h == NULL) {
        /* Without this check the fill loop below would dereference NULL. */
        fprintf(stderr, "host malloc failed\n");
        free(x_h);
        free(y_h);
        return 1;
    }

    cudaMalloc((void **)&x_d, memSize); cudaErr("malloc x_d");
    cudaMalloc((void **)&y_d, memSize); cudaErr("malloc y_d");

    /* cudaMemset takes an int byte value; all-zero bytes are 0.0f. */
    cudaMemset(x_d, 0, memSize); cudaErr("memset x_d");
    cudaMemset(y_d, 0, memSize); cudaErr("memsety_d");

    for (int i = 0; i < N; i++) {
        x_h[i] = (float)i;
        y_h[i] = 0.0f;
    }

    cudaMemcpy(x_d, x_h, memSize, cudaMemcpyHostToDevice); cudaErr("memcpy HtD");

    functionG<<< A , B >>>(x_d, y_d); cudaErr("launch functionG");
    /* Kernel launches are asynchronous: synchronise here so that faults
     * raised while the kernel runs are reported against the kernel rather
     * than against the following memcpy. */
    cudaDeviceSynchronize(); cudaErr("sync functionG");

    cudaMemcpy(y_h, y_d, memSize, cudaMemcpyDeviceToHost); cudaErr("memcpy result");

    for (int i = 0; i < N; i++) {
        printf("%d, %f %f \n", i, x_h[i], y_h[i]);
    }

    /* Release device and host buffers (cudaFree/free accept NULL). */
    cudaFree(x_d); cudaErr("free x_d");
    cudaFree(y_d); cudaErr("free y_d");
    free(x_h);
    free(y_h);

    return 0;
}
/*
 * functionG: element-wise kernel.
 *
 * For every global thread index idx < N, writes
 *     output[idx] = input[idx] + 0.001f * idx
 *
 * Launch layout: 1D grid of 1D blocks; gridDim.x * blockDim.x must be at
 * least N for every element to be written. Threads with idx >= N do nothing.
 * No shared memory is used.
 *
 * input  - device-accessible array of at least N floats (read only here)
 * output - device-accessible array of at least N floats (written)
 */
__global__ void functionG(float *input, float *output)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        /* Use a float literal and an explicit float conversion so the
         * arithmetic stays in single precision; a bare 0.001 is a double and
         * would promote the whole expression to double precision. */
        output[idx] = input[idx] + 0.001f * (float)idx;
    }
}
/*
 * cudaErr: report the most recent CUDA runtime error, if there is one.
 *
 * Fetches the last error with cudaGetLastError() (which also resets it) and,
 * when it is not cudaSuccess, prints the numeric code, the caller-supplied
 * tag and the runtime's description to stdout. Does not abort the program.
 *
 * msg - short label identifying the call site being checked
 */
void cudaErr(const char *msg){
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        return;   /* nothing pending, nothing to print */
    }
    printf("%d %s %s \n", status, msg, cudaGetErrorString(status));
}
// Big Data 2014. 10. 29. 10:22