我正在使用 CUDA 优化一些代码。我不确定是否应该在 `__global__` 函数（fun1）中使用 cudaMalloc（x 不是已经在 GPU 内存中分配好了吗？）：
// Child kernel from the question: launched with one thread per block,
// so each block writes exactly one element of y.
__global__ void fun2(double *y)
{
int i=blockIdx.x; // one block per element; blockIdx.x selects the output slot
y[i]=...; // placeholder from the original question -- actual computation omitted
}
// Parent kernel from the question: launches fun2 from device code
// (CUDA dynamic parallelism -- requires compute capability 3.5+ and
// compilation with -rdc=true).
// No cudaMalloc is needed here: x already points to device memory that
// was allocated with cudaMalloc in main(), so it can be passed straight
// through to the child kernel.
__global__ void fun1(double *x)
{
// Reuse x allocated in main(); do not allocate again inside the kernel.
fun2<<<N,1>>>(x); // fixed: original had malformed launch syntax "fun2<<<N,1>(x);"
...
}
int main(){
double *x; // device pointer; only ever passed to kernels, never dereferenced on host
...
// Allocate N doubles in GPU global memory; this is the only allocation needed --
// kernels receiving x can use it directly.
cudaMalloc((void**)&x, N*sizeof(double));
fun1<<<N,1>>>(x); // N blocks of 1 thread each, matching the blockIdx.x indexing in fun2
... // NOTE(review): snippet omits error checking and cudaFree(x)
}
答案 0（得分：1）
你的意思是这样的:
// Answer's version: fun2 as a __device__ helper, called directly from fun1
// (a plain function call, not a kernel launch), so no dynamic parallelism
// is required.
__device__ void fun2(double *y)
{
int i=blockIdx.x; // still indexes by block, relying on fun1's <<<N,1>>> launch
y[i]=...; // placeholder from the original answer -- actual computation omitted
}
// Answer's kernel: receives the device pointer x allocated in main() and
// hands it to the __device__ helper fun2 -- no allocation inside the kernel.
__global__ void fun1(double *x)
{
fun2(x); // ordinary device-side call; x is already device memory
...
}
int main(){
double *x; // device pointer; only ever passed to kernels, never dereferenced on host
...
// Single allocation of N doubles in GPU global memory, reused by the kernel.
cudaMalloc((void**)&x, N*sizeof(double));
fun1<<<N,1>>>(x); // N blocks of 1 thread each, matching the blockIdx.x indexing in fun2
... // NOTE(review): snippet omits error checking and cudaFree(x)
}
不过，更常见的做法是在 `__global__` 函数中计算线程索引（例如 `int i = blockIdx.x * blockDim.x + threadIdx.x;`）。