// CUDA training example
#include <stdio.h>
#include <stdlib.h>
#define NN 10
//__device__ float a[10]; // global memory
/*
 * Squares the first `size` elements of `input` in place.
 *
 * Launch layout: 1-D grid of 1-D blocks; any grid/block size is accepted.
 * No shared memory is used.
 *
 * input : device pointer to at least `size` floats.
 * size  : number of elements to process; values <= 0 make the kernel a no-op.
 */
__global__
void function ( float* input, int size ){
    // Every thread has its own private copy of these locals, so a <<<3, 4>>>
    // launch creates 12 independent instances of them.
    int stride = blockDim.x * gridDim.x;

    // Bound the index by the element count, not by the grid size: threads
    // whose index falls past the end do nothing, and when the grid is smaller
    // than the array each thread advances by the total thread count until
    // every element has been handled.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride){
        input[i] = input[i] * input[i];
    }
}
/*
 * Host driver: fills an NN-element array on the CPU, uploads it to the GPU,
 * runs the `function` kernel on it, downloads the result, and prints the
 * CUDA status string of the first failing step (or of success).
 *
 * Returns 0 when every step succeeded, 1 otherwise.
 */
int main()
{
    const int threadsPerBlock = 4;
    // Ceil-division so the launch always covers all NN elements.
    const int blocks = (NN + threadsPerBlock - 1) / threadsPerBlock;

    const size_t memSize = sizeof(float) * NN;
    // Pad the device buffer to a whole number of blocks so that every launched
    // thread maps to valid device memory even if the kernel does not mask off
    // the tail threads.
    const size_t devSize = sizeof(float) * (size_t)blocks * (size_t)threadsPerBlock;

    float *a = (float*) malloc( memSize );   // a: host (CPU) buffer
    float *b_dev = NULL;                     // b_dev: device (GPU) buffer
    if (a == NULL){
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)memSize);
        return 1;
    }

    // Give the input defined values instead of uploading uninitialized memory.
    for (int i = 0; i < NN; ++i){
        a[i] = (float)i;
    }

    // Device memory cannot be obtained through a return value the way malloc
    // does it; cudaMalloc writes the device address through its first argument.
    cudaError_t status = cudaMalloc( (void**)&b_dev, devSize );
    if (status == cudaSuccess){
        // Zero the whole buffer so the padding elements are defined too.
        status = cudaMemset( b_dev, 0, devSize );
    }
    if (status == cudaSuccess){
        status = cudaMemcpy( b_dev, a, memSize, cudaMemcpyHostToDevice ); // Upload
    }
    if (status == cudaSuccess){
        function <<<blocks, threadsPerBlock>>> ( b_dev, NN );
        // A launch returns nothing itself; configuration errors are reported here.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess){
        // Blocking copy: waits for the kernel to finish and also reports any
        // error raised while it was running.
        status = cudaMemcpy( a, b_dev, memSize, cudaMemcpyDeviceToHost ); // Download
    }

    printf("ERR MASSAGE :\n%s\n", cudaGetErrorString( status ));

    cudaFree( b_dev );   // no-op when b_dev is still NULL
    free( a );
    return (status == cudaSuccess) ? 0 : 1;
}