Question

我正在尝试在主机上创建一个指针数组。数组中的每个指针指向大小为4的数组。当我尝试将指针复制到设备时，复制失败，设备无法访问指针所指向的数组的内容。如何从指向数组的指针数组中复制指针，从主机到设备？

__global__ void kernel(int* D)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < 4)
    {
        printf("Device = %d\n", D[tid]);
        tid += blockDim.x * gridDim.x;
    }
}

int main(void)
{
    cudaProfilerStart();

    int* H[2];
    int* D[2]; 
    int test1[4] = { 1, 2, 3, 4 };
    int test2[4] = { 10, 20, 30, 40 };

    H[0] = test1;
    H[1] = test2;

    HANDLE_ERROR(cudaMalloc((void**)&D[0], 4 * sizeof(int)));
    HANDLE_ERROR(cudaMemcpy(D[0], H[0], 4 * sizeof(int), cudaMemcpyHostToDevice));
    kernel <<<1, 4 >>>(D[0]);

    cudaProfilerStop();

    return 0;
}

Answer 1

正如talonmies所指出的那样，守则没有任何问题。但是，您不会在内核中看到打印件，原因是内核调用是异步的，并且您的进程在内核打印之前可以执行。同步调用将在此处解决此问题。但是，在实际代码中可能不需要这样做。

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <stdio.h>



__global__ void kernel(int* D)
{
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
        while (tid < 4)
        {
                printf("Device = %d\n", D[tid]);
                tid += blockDim.x * gridDim.x;
        }
}

int main(void)
{
        // cudaProfilerStart();

        int* H[2];
        int* D[2];
        int test1[4] = { 1, 2, 3, 4 };
        int test2[4] = { 10, 20, 30, 40 };

        H[0] = test1;
        H[1] = test2;

        cudaMalloc((void**)&D[0], 4 * sizeof(int));
        cudaMemcpy(D[0], H[0], 4 * sizeof(int), cudaMemcpyHostToDevice);
        kernel <<<1, 1 >>>(D[0]);

        cudaError_t cudaerr1 = cudaDeviceSynchronize();
        if (cudaerr1 != cudaSuccess)
                printf("kernel launch failed with error \"%s\".\n",
                        cudaGetErrorString(cudaerr1));

         //cudaProfilerStop();

        return 0;
}

cudamemcpy指针数组，其中每个指针指向一个数组

1 个答案: