我有一个cuda内核,它从第i + 1个位置复制到设备阵列中的第i个位置。复制不是从索引值是32的倍数的位置进行的。[32] - > [31]未复制,[64] - > [63]未复制。无论块大小如何,都会发生这种情况。如何解决这个问题? 这是完整的计划。没有调用syncthreads()。问题仍然存在。
#include <cstdio>
struct SodA { float *df0; size_t pitch; };
__global__ void stream_kernel (SodA dA1, SodA dA2, int M, int N);
int main(int argc, char **argv){
int i, M=32, N=32;float *f0;
SodA dA1, dA2;
dim3 blockSize = dim3(32,32);
dim3 gridSize = dim3(1,1);
f0 = (float *)malloc(M*N*sizeof(float));
cudaMallocPitch((void **)&dA1.df0, &dA1.pitch, sizeof(float)*M, N);
cudaMallocPitch((void **)&dA2.df0, &dA2.pitch, sizeof(float)*M, N);
for (i=0; i<M*N; i++) f0[i] = (float)rand()/RAND_MAX;
cudaMemcpy2D((void *)dA1.df0, dA1.pitch, (void *)f0, sizeof(float)*M, sizeof(float)*M, N, cudaMemcpyHostToDevice);
printf("\n");
for(int i=28;i<70; i++)
printf("%5d ", i);
printf("\n\n");
printf("\n");
for(int i=28;i<70; i++)
printf("%.3f ", f0[i]);
printf("\n\n");
stream_kernel<<<gridSize, blockSize>>>(dA1, dA2, M, N);
cudaMemcpy2D( (void *)f0, sizeof(float)*M, (void *)dA2.df0, dA2.pitch,sizeof(float)*M, N, cudaMemcpyDeviceToHost);
printf("\n");
for(int i=28;i<70; i++)
printf("%.3f ", f0[i]);
printf("\n\n");
free(f0);cudaFree(dA2.df0);
cudaFree(dA1.df0);
printf("\n\n");
return 0;
}
__global__ void stream_kernel (SodA dA1, SodA dA2, int M, int N)
{
int i, j, i2d;
i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;
i2d = i + j * M;
if (i2d>0) { dA2.df0[i2d-1] = dA1.df0[i2d];}
}
输出
28 29 30 31 32 33 ....
0.999 0.218 0.513 0.839 0.613 0.296 0.638....
0.218 0.513 0.839 0.198 0.296 0.638 ....
答案 0 :(得分:1)
感谢您的评论。在以行主要顺序存储的2D数组中,该内核将第(i,j)位置移动到其先前位置。由于数组是倾斜的,如注释中所述,使用-1 offset无法找到每行中第一个元素的前一个元素。通过计算前一个数组中的最后一个元素来处理这种特殊情况。我得到了答案。感谢。