我是 CUDA 新手，正在尝试编写一段在球面上生成随机点的代码。代码如下：
// Initialises one MRG32k3a generator per launched thread.
// Every thread uses seed 0, its flat global thread index as the subsequence,
// and offset 0. There is no range check here: each thread in the grid writes
// its own entry of `state`.
__global__
void setup_kernel(curandStateMRG32k3a *state)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(0, tid, 0, state + tid);
}
// Writes one (x, y, z) triple per thread, indexed by the flat global thread id.
// The generator state for the thread is copied out of `state` before the range
// check, so every launched thread performs that read. Threads whose index is
// not below numberOfElements write nothing. The local generator copy is not
// stored back to `state`.
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
    const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandStateMRG32k3a rng = state[idx];
    if (idx >= numberOfElements)
    {
        return;
    }

    // First candidate pair comes straight from curand_uniform (no rescaling).
    float u = curand_uniform(&rng);
    float v = curand_uniform(&rng);

    // Redraw, this time mapped through 2*r - 1, until the pair lies in the unit disc.
    for (; u * u + v * v > 1.0f; )
    {
        u = curand_uniform(&rng) * 2.0f - 1.0f;
        v = curand_uniform(&rng) * 2.0f - 1.0f;
    }

    // Shared square-root factor of the x and y components.
    const float root = sqrtf(1.0f - u * u - v * v);
    x[idx] = 2.0f * u * root;
    y[idx] = 2.0f * v * root;
    z[idx] = 1.0f - 2.0f * (u * u + v * v);
}
// Fills h_x, h_y and h_z with the values produced on the device by
// computeRandomVectors, one element per entry of h_x.
// Data is staged through calloc'd host buffers and cudaMalloc'd device buffers;
// failures are reported on std::cout.
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
// NOTE(review): with '&&' this early return is taken only when h_x differs in
// size from BOTH h_y and h_z; a single mismatching vector passes the check.
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}
// `size` is a byte count (element count * sizeof(float)), not an element count.
size_t size = h_x.size() * sizeof(float);
// Zero-initialised host staging buffers, one per component.
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
std::cout << "Host memory allocation failure" << std::endl;
// NOTE(review): returns without freeing whichever buffers were allocated.
return;
}
// Device buffers, one per component, each `size` bytes.
float* d_p_x;
float* d_p_y;
float* d_p_z;
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
// NOTE(review): returns without releasing the host buffers or any device
// buffer that was successfully allocated.
return;
}
// One generator state per element of h_x.
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
// NOTE(review): returns without releasing the buffers allocated above.
return;
}
int threads = 256;
// NOTE(review): the block count is derived from `size`, which is in bytes,
// not from h_x.size(); devStates only has h_x.size() entries.
dim3 grid = size / threads;
// The launch itself is not checked for errors here.
setup_kernel<<<grid,threads>>>(devStates);
// Copies the zero-filled host buffers to the device.
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
// The failure is only reported; execution continues below.
}
// size / sizeof(float) equals h_x.size(), i.e. the element count.
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
// Copies the device results back into the host staging buffers.
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
// The failure is only reported; the staging buffers are still copied out below.
}
// Transfer the staged values into the caller's vectors element by element.
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}
// Release host staging buffers, then device buffers.
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
// Finally resets the current device.
cudaDeviceReset();
}
当向量中的元素数不超过约 4000 时（我试过 1K、2K、3K 和 4K），这段代码可以正常工作。超过这个数量后，第一个 cudaMemcpy 就会返回 CUDA 错误 Illegal Address。我不认为是显存耗尽了，我用的是 GTX 980（4GB 全局内存）。有人知道该如何解决这个问题吗？
编辑:建议修改后的代码如下:
// Initialises one MRG32k3a generator per element.
// Each thread whose flat global index is below numberOfElements seeds its own
// entry of `state` with seed 0, subsequence = thread index, offset 0; the
// remaining threads of the grid do nothing.
__global__
void setup_kernel(curandStateMRG32k3a *state, unsigned int numberOfElements)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= numberOfElements)
    {
        return;
    }
    curand_init(0, tid, 0, state + tid);
}
/**
 * Generates one uniformly distributed random point on the unit sphere per
 * element, using Marsaglia's method: draw (a, b) uniformly in the square
 * [-1, 1]^2, reject pairs outside the unit disc, then map to
 *   x = 2a*sqrt(1 - s), y = 2b*sqrt(1 - s), z = 1 - 2s,  with s = a^2 + b^2.
 *
 * Launch layout: 1D grid of 1D blocks; thread i handles element i. The grid
 * may contain more threads than elements; surplus threads exit immediately.
 *
 * @param x, y, z           device output arrays, each with at least
 *                          numberOfElements floats
 * @param numberOfElements  number of points to generate
 * @param state             device array of at least numberOfElements
 *                          generator states, already initialised with curand_init
 */
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
    const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard FIRST: threads in the grid tail must not touch state[i], x[i], ...
    // Reading state[i] before this check was an out-of-bounds global read
    // whenever the element count was not a multiple of the block size.
    if (i >= numberOfElements)
    {
        return;
    }

    // Work on a register/local copy of the generator for speed.
    curandStateMRG32k3a localState = state[i];

    float a, b, s;
    do
    {
        // curand_uniform returns values in (0, 1]; map them to (-1, 1].
        // Every candidate pair is rescaled, including the first one, so all
        // sign combinations of (a, b) are possible.
        a = curand_uniform(&localState) * 2.0f - 1.0f;
        b = curand_uniform(&localState) * 2.0f - 1.0f;
        s = a * a + b * b;
    } while (s > 1.0f);

    // s <= 1 here, so the square-root argument is never negative.
    const float root = sqrtf(1.0f - s);
    x[i] = 2.0f * a * root;
    y[i] = 2.0f * b * root;
    z[i] = 1.0f - 2.0f * s;

    // Persist the advanced generator so a later launch continues the sequence
    // instead of reproducing the same points.
    state[i] = localState;
}
/** Prints the CUDA error name followed by a context line on std::cout. */
static void reportCudaFailure(cudaError_t status, const char* what)
{
    std::cout << cudaGetErrorName(status) << std::endl;
    std::cout << what << std::endl;
}

/** Frees the device buffers used by generatePointsOnASphere; cudaFree ignores null pointers. */
static void releaseSphereBuffers(float* d_x, float* d_y, float* d_z, curandStateMRG32k3a* d_states)
{
    cudaFree(d_states);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
}

/**
 * Fills the three host vectors with the coordinates of random points on the
 * unit sphere, computed on the GPU (one point per element).
 *
 * @param h_x, h_y, h_z  output vectors; all three must have the same size.
 *                       Their contents are overwritten on success.
 *
 * Errors are reported on std::cout; on any failure the device buffers are
 * released before returning. An empty input is a no-op.
 */
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
    // All three vectors must agree in length. ('||', not '&&': a single
    // mismatching vector is already an error.)
    if(h_x.size() != h_y.size() || h_x.size() != h_z.size())
    {
        std::cout << "The three component vectors have unmatching size()" << std::endl;
        return;
    }

    const size_t count = h_x.size();
    if(count == 0)
    {
        // Nothing to generate; a zero-block launch would be an invalid configuration.
        return;
    }
    const size_t size = count * sizeof(float);   // bytes per component buffer

    // Initialise to NULL so the cleanup helper is safe on every path.
    float* d_p_x = NULL;
    float* d_p_y = NULL;
    float* d_p_z = NULL;
    curandStateMRG32k3a* devStates = NULL;

    cudaError_t status = cudaMalloc((void **)&d_p_x, size);
    if(status == cudaSuccess) status = cudaMalloc((void **)&d_p_y, size);
    if(status == cudaSuccess) status = cudaMalloc((void **)&d_p_z, size);
    if(status != cudaSuccess)
    {
        reportCudaFailure(status, "Device memory allocation failure");
        releaseSphereBuffers(d_p_x, d_p_y, d_p_z, devStates);
        return;
    }

    status = cudaMalloc((void **)&devStates, count * sizeof(curandStateMRG32k3a));
    if(status != cudaSuccess)
    {
        reportCudaFailure(status, "Random generator states memory allocation failure");
        releaseSphereBuffers(d_p_x, d_p_y, d_p_z, devStates);
        return;
    }

    // Launch configuration is based on the ELEMENT count (not the byte count),
    // rounded up so the tail is covered; the kernels bounds-check the surplus.
    const int threads = 512;
    const dim3 grid(static_cast<unsigned int>((count + threads - 1) / threads));
    const unsigned int numberOfElements = static_cast<unsigned int>(count);

    // No host-to-device copy is needed: the kernel writes every output element.
    setup_kernel<<<grid,threads>>>(devStates, numberOfElements);
    status = cudaGetLastError();                 // launch-configuration errors
    if(status == cudaSuccess)
    {
        computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,numberOfElements, devStates);
        status = cudaGetLastError();
    }
    if(status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();        // errors raised while the kernels ran
    }

    if(status != cudaSuccess)
    {
        reportCudaFailure(status, "Kernel launch or execution failure");
    }
    else
    {
        // Copy straight into the caller's vectors; host_vector storage is contiguous.
        status = cudaMemcpy(&h_x[0], d_p_x, size, cudaMemcpyDeviceToHost);
        if(status == cudaSuccess) status = cudaMemcpy(&h_y[0], d_p_y, size, cudaMemcpyDeviceToHost);
        if(status == cudaSuccess) status = cudaMemcpy(&h_z[0], d_p_z, size, cudaMemcpyDeviceToHost);
        if(status != cudaSuccess)
        {
            reportCudaFailure(status, "Device to Host memory copy failure");
        }
    }

    releaseSphereBuffers(d_p_x, d_p_y, d_p_z, devStates);

    // Kept from the original. NOTE(review): cudaDeviceReset() destroys every
    // allocation of this process on the current device, including any owned by
    // the caller — consider moving it to program shutdown.
    cudaDeviceReset();
}
很抱歉还在这里继续追问，但我想，弄清楚我现在犯的错误能帮助我更好地理解 CUDA。现在，当 h_x.size() 为 20k 时，我在设备到主机（device -> host）的 cudaMemcpy 上得到 cudaErrorIllegalAddress 错误。我仍然不明白，为什么这段代码对较小的元素数有效，而对较大的元素数却无效。
答案 0 :(得分:2)
问题在于:
size_t size = h_x.size() * sizeof(float);
...
int threads = 256;
dim3 grid = size / threads;
您的 size 变量是按字节数计算的，因此不适合用来计算网格大小。您应该这样计算网格大小：
dim3 grid = h_x.size() / threads;
或类似的写法。另请注意，除非向量长度（h_x.size()）可以被 threads（即 256）整除，否则这种写法无法正确初始化所有 curand 状态。解决方法是在 setup_kernel 中加入线程索引检查，类似于您另一个内核中的做法：
// Initialises the generator state of every thread whose flat global index is
// below `size` (seed 0, subsequence = thread index, offset 0). Threads at or
// beyond `size` leave `state` untouched.
__global__
void setup_kernel(curandStateMRG32k3a *state, int size)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size)
    {
        return;
    }
    curand_init(0, tid, 0, state + tid);
}
并启动足够多的线程以覆盖整个向量：
dim3 grid = (h_x.size()+threads-1) / threads;