I've run into a strange problem when working with a 2D array on a CUDA device.
#define VR 100 // rows
#define ST 13  // columns

__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curand_init(seed, id, 0, &globalState[id]);
    cuPrintf("Thread id: %d \n", id);
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }
}

int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR);
    dim3 dimBlock(VR);
    dim3 dimGrid(1,1);
    curandState* devStates;
    cudaMalloc(&devStates, VR*ST*sizeof(curandState));
    test<<<dimGrid, dimBlock>>>(d_arr, devStates, pitch, unsigned(time(NULL)));
    cudaMemcpy(h_arr, d_arr, VR*ST*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < VR; i++) {
        for (int j = 0; j < ST; j++) {
            cout << "N[" << i << "][" << j << "]=" << h_arr[(i*ST)+j] << endl;
        }
    }
}
Instead of uniformly distributed numbers, I get them in runs of 13 with a bunch of zeros in between. See: http://pastie.org/6106381
Answer 0 (score: 4)
The problem is that the original data array is allocated with cudaMallocPitch, but the copy back to the host is done with an ordinary cudaMemcpy. This produces unexpected results, because cudaMallocPitch pads each row to satisfy alignment requirements, while cudaMemcpy assumes everything is stored contiguously. Below, after a short illustration of the layout, is the code with what I believe are the necessary corrections:
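To make the layout concrete, here is a commented sketch of the cudaMemcpy2D call used in the fix. The byte counts follow from VR and ST above; the actual pitch value is device dependent, so treat the numbers in the comments as assumptions:

// Each device row holds ST floats = 13 * 4 = 52 bytes of real data,
// but cudaMallocPitch rounds the row stride ("pitch") up to an
// alignment boundary (often 128 or 512 bytes, depending on the device):
//
//   device memory: [52 bytes data][pitch-52 bytes padding][52 bytes data]...
//
// A flat cudaMemcpy of VR*ST*sizeof(float) bytes walks straight through
// that layout, padding included, which is exactly the "13 numbers, then
// zeros" pattern in the question. cudaMemcpy2D strips the padding
// because it is told both strides explicitly:
cudaMemcpy2D(h_arr,              // dst: tightly packed host array
             ST * sizeof(float), // dst pitch (host rows have no padding)
             d_arr,              // src: pitched device allocation
             pitch,              // src pitch, as returned by cudaMallocPitch
             ST * sizeof(float), // width in bytes actually copied per row
             VR,                 // number of rows
             cudaMemcpyDeviceToHost);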
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <curand_kernel.h>

#define VR 100 // rows
#define ST 13  // columns

__device__ float generate(curandState* globalState, int id)
{
    curandState localState = globalState[id];
    float rand;
    do {
        rand = curand_uniform(&localState);
    } while (rand == 0); // reject an exact zero
    globalState[id] = localState;
    return rand;
}

__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
    curand_init(seed, id, 0, &globalState[id]);
    float* row = (float*)(((char*)arr) + id * pitch);
    for (int j = 0; j < ST; ++j) {
        row[j] = generate(globalState, id);
    }
}

using namespace std;

int main() {
    float *d_arr;
    float *h_arr = new float[VR*ST];
    size_t pitch;
    cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR);
    dim3 dimBlock(VR);
    dim3 dimGrid(1,1);
    curandState* devStates;
    cudaMalloc(&devStates, VR*ST*sizeof(curandState)); // only VR states are actually used
    test<<<dimGrid, dimBlock>>>(d_arr, devStates, pitch, unsigned(time(NULL)));
    cudaMemcpy2D(h_arr, ST*sizeof(float), d_arr, pitch, ST*sizeof(float), VR, cudaMemcpyDeviceToHost);
    for (int i = 0; i < VR; i++) {
        for (int j = 0; j < ST; j++) {
            cout << "N[" << i << "][" << j << "]=" << h_arr[(i*ST)+j] << endl;
        }
    }
}
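One note on the design: neither version checks CUDA return codes. A minimal error-checking sketch (the checkCuda macro name is my own, not part of the original answer) could look like this:

#include <cstdio>
#include <cstdlib>

// Hypothetical helper: abort with a readable message if a CUDA call fails.
#define checkCuda(call)                                               \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage:
//   checkCuda(cudaMallocPitch(&d_arr, &pitch, ST*sizeof(float), VR));
//   test<<<dimGrid, dimBlock>>>(d_arr, devStates, pitch, unsigned(time(NULL)));
//   checkCuda(cudaGetLastError()); // catches kernel-launch errors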
Compiling the above code with:
nvcc -arch=sm_20 -lcurand -o t70 t70.cu
and then running it, I get what looks like "normal" output:
N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869
(output truncated)
Answer 1 (score: 0)
I think that part is wrong: you should allocate for VR threads or blocks, since you already loop over ST inside the kernel. Maybe that will fix it.
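If this is referring to the curandState allocation, a sketch of the change (my reading of the suggestion, not something the answer spells out) would be:

// One state per thread: the kernel launches VR threads, and each thread
// reuses its own state across the ST-iteration loop, so VR states suffice.
curandState* devStates;
cudaMalloc(&devStates, VR * sizeof(curandState));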