目前,我正在使用CUDA和OpenGL来模拟海洋。
我发现当顶点数量在6,000
或25,000
附近时,程序可以正常工作。但如果顶点数量在100,000
或400,000
附近,则会出现unspecified launch failure
错误。
这是我用来在帧中用CUDA更新顶点位置和法线的代码:
while (!glfwWindowShouldClose(window))
{
...
vec3 *d_vertices = NULL, *d_normals = NULL;
cudaGraphicsMapResources(1, &cudaVboResVertices, 0);
cudaGraphicsMapResources(1, &cudaVboResNormals, 0);
cudaGraphicsResourceGetMappedPointer(
(void**)&d_vertices, NULL, cudaVboResVertices
);
cudaGraphicsResourceGetMappedPointer(
(void**)&d_normals, NULL, cudaVboResNormals
);
//update vertices positions and normals
//faceNumber*3 is the number of vertices
launchGPUKernel(faceNumber*3, d_vertices, d_normals);
t += dt;
//the error first occurs at this line at the first iteration
cudaGraphicsUnmapResources(1, &cudaVboResNormals, 0);
cudaGraphicsUnmapResources(1, &cudaVboResVertices, 0);
...
}
launchGPUKernel
是这样的:
void launchGPUKernel(int num_points, vec3 *d_vtxs, vec3 *d_nmls){
dim3 grid(num_points/512 + 1, 1);
dim3 block(16, 64, 1);
d_update<<<grid, block>>>(num_points, WAVE_NUM, d_vtxs, d_nmls, d_wave_paras, t);
}
d_update
:
__global__ void d_update(
int num_points, int wave_num,
vec3 *d_vtxs, vec3 *d_nmls, float *d_wv_prs,
float d_time
){
long block_number = blockIdx.x + blockIdx.y*gridDim.x;
block_number *= (blockDim.x*blockDim.y);//the number of threads before current block
long idx = threadIdx.x + threadIdx.y*blockDim.x;
idx += block_number;
float x, z, height;
x = d_vtxs[idx].x;
z = d_vtxs[idx].z;
height = 0;
for (size_t j = 0; j < wave_num; j++) {
float a, b, theta, omega, phi;
a = d_wv_prs[j*5+0];
b = d_wv_prs[j*5+1];
theta = d_wv_prs[j*5+2];
omega = d_wv_prs[j*5+3];
phi = d_wv_prs[j*5+4];
float temp = (cos(theta)*x + sin(theta)*z)*omega + d_time*phi;
height += a*cos(temp) + b*sin(temp);
}
d_vtxs[idx].y = height;
float Hx, Hz;
Hx = 0;
Hz = 0;
for (size_t j = 0; j < wave_num; j++) {
float a, b, theta, omega, phi;
a = d_wv_prs[j*5+0];
b = d_wv_prs[j*5+1];
theta = d_wv_prs[j*5+2];
omega = d_wv_prs[j*5+3];
phi = d_wv_prs[j*5+4];
float temp = (cos(theta)*x + sin(theta)*z)*omega + d_time*phi;
Hx += -sin(temp)*omega*cos(theta)*a + cos(temp)*omega*cos(theta)*b;
Hz += -sin(temp)*omega*sin(theta)*a + cos(temp)*omega*sin(theta)*b;
}
vec3 v3_temp = normalize( vec3(-Hx, 1, -Hz) );
d_nmls[idx].x = v3_temp.x;
d_nmls[idx].y = v3_temp.y;
d_nmls[idx].z = v3_temp.z;
}
我不知道出了什么问题。
OS X EI Captitan 10.11.6
CUDA 8.0
OpenGL 4.0
答案 0 :(得分:0)
最后,我想通了。
问题是由于launchGPUKernel
中的代码,我设置的线程数超过了我拥有的顶点数:
dim3 grid(num_points/512 + 1, 1);
dim3 block(16, 64, 1);
这导致d_vtxs[idx]
和d_nmls[idx]
访问了不存在的地址。
当我将其更改为
时dim3 grid(num_points/(16*64), 1);
dim3 block(16, 64, 1);
它可以正常工作。