Question

我正在使用CUDA C，并且有一个数组

arr[N][N] = {
    { 0, 1, 12, 13 },
    { 0, 1,  2,  3 },
    { 0, 1, 22, 33 },
    { 0, 1, 12, 13 }
};

我需要使每个线程对arr的2个元素执行操作。

例如：

thread 0: { 0, 1, 12, 13 }, { 0, 1,  2,  3 }
thread 1: { 0, 1, 22, 33 }, { 0, 1, 12, 13 }

这是我尝试过的内核代码的一部分：

__global__ void kernel(int *A)
{
    int tab[N*N];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    for(k = (tid * N), j = 0; j < N, k < (tid + 1) * N * 2; j++, k++) 
        tab[j] = A[k];
}

现在线程0具有以下元素：

tab[8] = { 0, 1, 12, 13, 0, 1, 2, 3 };

和线程1：

tab[8] = { 0, 1, 22, 33, 0, 1, 12, 13 };

在这个示例中，我需要知道如何制作线程0：

tab[][] = {
    { 0, 1, 12, 13 },
    { 0, 1,  2,  3 }
}

这是内核调用：

int main()
{

    int *arr_device;
    int arr[N][N] = {
                     { 0, 1, 12, 13 },
                     { 0, 1,  2,  3 },
                     { 0, 1, 22, 33 },
                     { 0, 1, 12, 13 }
    };

    const size_t arr_size = sizeof(int) *N*N;

    cudaMalloc((void **)&arr_device, arr_size); 

    cudaMemcpy(arr_device, arr, arr_size, cudaMemcpyHostToDevice); 

    crossover<<<1,2>>>(arr_device);

    cudaFree(arr_device);

    return 0;
}

在2D数组上并行执行线程操作

0 个答案: