我正在使用 CUDA 优化一些代码。我不确定是否应该在 `__global__` 函数（fun1）中使用 cudaMalloc（x 不是已经在 GPU 内存中分配好了吗？）：
// Child kernel from the question: launched with one thread per block,
// so each block writes exactly one element of y.
__global__ void fun2(double *y)
{
int i=blockIdx.x; // one block per element; blockIdx.x selects the output slot
y[i]=...; // placeholder from the original question -- actual computation omitted
}
// Parent kernel from the question: launches fun2 from device code
// (CUDA dynamic parallelism -- requires compute capability 3.5+ and
// compilation with -rdc=true).
// No cudaMalloc is needed here: x already points to device memory that
// was allocated with cudaMalloc in main(), so it can be passed straight
// through to the child kernel.
__global__ void fun1(double *x)
{
// Reuse x allocated in main(); do not allocate again inside the kernel.
fun2<<<N,1>>>(x); // fixed: original had malformed launch syntax "fun2<<<N,1>(x);"
...
}
int main(){
double *x; // device pointer; only ever passed to kernels, never dereferenced on host
...
// Allocate N doubles in GPU global memory; this is the only allocation needed --
// kernels receiving x can use it directly.
cudaMalloc((void**)&x, N*sizeof(double));
fun1<<<N,1>>>(x); // N blocks of 1 thread each, matching the blockIdx.x indexing in fun2
... // NOTE(review): snippet omits error checking and cudaFree(x)
}
答案 0（得分：1）
你的意思是这样的:
// Answer's version: fun2 as a __device__ helper, called directly from fun1
// (a plain function call, not a kernel launch), so no dynamic parallelism
// is required.
__device__ void fun2(double *y)
{
int i=blockIdx.x; // still indexes by block, relying on fun1's <<<N,1>>> launch
y[i]=...; // placeholder from the original answer -- actual computation omitted
}
// Answer's kernel: receives the device pointer x allocated in main() and
// hands it to the __device__ helper fun2 -- no allocation inside the kernel.
__global__ void fun1(double *x)
{
fun2(x); // ordinary device-side call; x is already device memory
...
}
int main(){
double *x; // device pointer; only ever passed to kernels, never dereferenced on host
...
// Single allocation of N doubles in GPU global memory, reused by the kernel.
cudaMalloc((void**)&x, N*sizeof(double));
fun1<<<N,1>>>(x); // N blocks of 1 thread each, matching the blockIdx.x indexing in fun2
... // NOTE(review): snippet omits error checking and cudaFree(x)
}
不过，更常见的做法是在 `__global__` 函数中计算线程索引（例如 `int i = blockIdx.x * blockDim.x + threadIdx.x;`）。