上,
我写了一个cuda程序,我已经给出了下面的内核函数。设备内存为
通过cudaMalloc()分配;
* md的值是10;
// Question's kernel: every launched thread performs an unsynchronized
// read-modify-write on the SAME single int at *md. The final value
// therefore depends on the (undefined) interleaving of threads — a data
// race. This is why the observed outputs (19 for 5 threads, 21 for 4
// threads) merely reflect whichever thread happened to write last.
__global__ void add(int *md)
{
// oper: fixed multiplier applied by every thread in the first statement
int x,oper=2;
// x: this thread's index within the block (0..blockDim.x-1)
x=threadIdx.x;
// Executed by ALL threads concurrently on the same address — racy.
* md = *md*oper;
// Each branch below is taken by exactly one thread, but they all still
// target the same *md, so their relative order is undefined.
if(x==1)
{
*md = *md*0;
}
if(x==2)
{
*md = *md*10;
}
if(x==3)
{
*md = *md+1;
}
if(x==4)
{
*md = *md-1;
}
}
执行了上面的代码
add<<<1,5>>>(md) , add<<<1,4>>>(md)
for <<<1,5>>> the output is 19
for <<<1,4>>> the output is 21
1)我怀疑cudaMalloc()会在设备主内存中分配吗? 2)为什么总是在上面的程序中单独执行最后一个线程?
谢谢
答案 0 :(得分:1)
代码中的每个线程都将不同的输出写入相同的位置(md)。因此,当程序完成执行时, md 可以包含4-5个可能值中的任何一个。
如果你想捕捉每个线程的输出,这就是你应该做的事情
// The size of output should be equal to the number of threads in your block
// Fixed version of the answer's kernel: each thread writes its result to
// its OWN slot of `output`, so no two threads race on the same address.
//
// input:  the scalar value to transform (10 in the question).
// output: device array with at least blockDim.x ints; slot i receives
//         thread i's result.
//
// Expected launch: one block of 5 threads (add<<<1,5>>>(value, out)).
// Fixes vs. the original snippet: the body used an undeclared `md`
// (the parameter is `input`), contained the non-code line
// "..... and so on", and never wrote output[4] even though the host
// copies back 5 ints.
__global__ void add (int input, int * output){
    int x = threadIdx.x;       // thread index starts from 0 in CUDA
    int oper = 2;
    int md = input * oper;     // per-thread local copy: 10 * 2 = 20
    if (x == 0)
        output[0] = md * 0;    // output is 0
    else if (x == 1)
        output[1] = md * 10;   // output is 200
    else if (x == 2)
        output[2] = md + 1;    // output is 21
    else if (x == 3)
        output[3] = md - 1;    // output is 19
    else if (x == 4)
        output[4] = md;        // output is 20 (slot 4 was left unwritten before)
}
执行代码
int value = 10;
int * out;                              // device buffer: one slot per thread
int size = 5*sizeof(int);
cudaMalloc((void**)&out,size );
// Fixed: the original read add<<<1,5>>(value,out) — a malformed launch
// (">>" instead of ">>>") with a missing semicolon.
add<<<1,5>>>(value,out);
int * host_out = (int*)malloc(size);
// Blocking copy: also synchronizes with the kernel before reading results.
cudaMemcpy(host_out,out,size,cudaMemcpyDeviceToHost);
//Now the host_out should have the following values:
//host_out[0] = 0
//host_out[1] = 200
//host_out[2] = 21
//host_out[3] = 19
//host_out[4] = ..
// Release the buffers (missing in the original snippet).
cudaFree(out);
free(host_out);