下面是我想要实现的功能:
// Each thread allocates its own buffer from the device heap and publishes the
// resulting pointer in d_array[threadIdx.x].
//
// The kernel is named mallocTest to match the launch in main(); naming it
// `malloc` would make the malloc(x) call below resolve to the kernel itself
// instead of the device-side allocator.
//
// Launch layout: the slot index is threadIdx.x only, so this expects a 1-D
// launch of a single block, and d_array must have at least blockDim.x entries.
__global__ void mallocTest(int **d_array)
{
    // x is different within each thread and cannot be determined at
    // compile time.
    // NOTE: device-side malloc() returns NULL when the device heap is
    // exhausted, so the stored pointer may be NULL.
    int *ptr = (int *)malloc(x);
    // then fill that space with something and put it in the d_array
    d_array[threadIdx.x] = ptr;
}
// Host driver: reserves a device-side table of per-thread pointers, launches
// mallocTest so every thread fills in its slot, then (hypothetically) copies
// the pointed-to data back to the host.
int main()
{
    int **d_array;
    int *h_array;
    int blockSize = 32;

    // Size the heap used by in-kernel malloc(). cudaDeviceSetLimit replaces
    // the deprecated cudaThreadSetLimit.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, somethingBigEnough);

    // cudaMalloc writes the device address through its first argument, so it
    // needs the address of d_array, not d_array's (uninitialized) value.
    cudaMalloc((void **)&d_array, sizeof(int *) * blockSize);

    mallocTest<<<1, blockSize>>>(d_array);
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    cudaDeviceSynchronize();

    //here is what I want:
    h_array = (int *)malloc(sizeof(int) * maxSize);
    // NOTE: cudaDeepMemcpy is hypothetical; the CUDA runtime has no such
    // function. It stands for "follow each pointer in d_array and copy the
    // data it points to back to the host".
    cudaDeepMemcpy(h_array, d_array, cudaMemcpyDeviceToHost);

    free(h_array);
    cudaFree(d_array);
    return 0;
}
我想知道 CUDA 中是否有类似 "deepMemcpy" 的功能。如果不支持,我可以用什么方法把运行时生成的数据复制回主机?