I am currently porting some CUDA code that finds the shortest path from every node to every other node in an (undirected) graph. Basically, the CUDA code builds the graph read from a text file and then constructs the adjacency arrays h_v and h_e.
For example
A B
A C
B C
gives (with nodes A, B, C numbered 0, 1, 2, so entry i holds the two endpoints of edge i)
h_v[0] = 0, h_e[0]=1
h_v[1] = 0, h_e[1]=2
h_v[2] = 1, h_e[2]=2
It then calls the kernel to compute the shortest paths from each node using BFS.
The CUDA host code is as follows:
int cc_bfs(int n_count, int e_count, int *h_v, int *h_e, float *h_cc, bool ec){
    int *d_v, *d_e;
    cudaCheckError(cudaMalloc((void **)&d_v, sizeof(int)*e_count));
    cudaCheckError(cudaMalloc((void **)&d_e, sizeof(int)*e_count));
    cudaCheckError(cudaMemcpy(d_v, h_v, sizeof(int)*e_count, cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(d_e, h_e, sizeof(int)*e_count, cudaMemcpyHostToDevice));

    int *d_d, *d_dist;
    cudaCheckError(cudaMalloc((void **)&d_d, sizeof(int)*n_count));
    cudaCheckError(cudaMalloc((void **)&d_dist, sizeof(int)));

    int *h_d;
    h_d = (int *)malloc(sizeof(int)*n_count);
    bool *d_continue;
    cudaCheckError(cudaMalloc((void**)&d_continue, sizeof(bool)));

    for(int s=0; s<n_count; s++){ //BIG FOR LOOP
        ////// code to initialize h_d[i]
        for(int i=0; i<n_count; i++)
            h_d[i] = -1;
        h_d[s] = 0; //for marking the root
        cudaCheckError(cudaMemcpy(d_d, h_d, sizeof(int)*n_count, cudaMemcpyHostToDevice));
        //////////////////////////////

        ///////////////////////////////
        int threads_per_block = e_count;
        int blocks = 1;
        if(e_count > MAX_THREADS_PER_BLOCK){
            blocks = (int)ceil(e_count/(float)MAX_THREADS_PER_BLOCK);
            threads_per_block = MAX_THREADS_PER_BLOCK;
        }
        dim3 grid(blocks);
        dim3 threads(threads_per_block);
        /////////////////////////////////

        bool h_continue;
        int h_dist = 0;
        cudaCheckError(cudaMemset(d_dist, 0, sizeof(int)));
        do{
            h_continue = false;
            cudaCheckError(cudaMemcpy(d_continue, &h_continue, sizeof(bool), cudaMemcpyHostToDevice));
            cc_bfs_kernel<<<grid, threads>>>(d_v, d_e, d_d, d_continue, d_dist, e_count);
            checkCUDAError("Kernel invocation");
            cudaThreadSynchronize();
            h_dist++;
            cudaCheckError(cudaMemcpy(d_dist, &h_dist, sizeof(int), cudaMemcpyHostToDevice)); //for what?
            cudaCheckError(cudaMemcpy(&h_continue, d_continue, sizeof(bool), cudaMemcpyDeviceToHost));
        }while(h_continue);
        ///////////////////
        //then code to read back h_d from device
    }
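(cudaCheckError and checkCUDAError are small error-checking helpers defined elsewhere in my project; the sketch below shows what such helpers typically look like, not my exact definitions.)

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative error-checking helpers only; the actual definitions are not shown in this post.
#define cudaCheckError(call)                                          \
    do {                                                              \
        cudaError_t e = (call);                                       \
        if (e != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
                    __FILE__, __LINE__, cudaGetErrorString(e));       \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

void checkCUDAError(const char *msg){
    cudaError_t e = cudaGetLastError();
    if (e != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(e));
        exit(EXIT_FAILURE);
    }
}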
Here is the CUDA kernel:
__global__ void cc_bfs_kernel(int *d_v, int *d_e, int *d_d,
                              bool *d_continue, int *d_dist, int e_count){
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if(tid < e_count){
        /* for each edge (u, w) */
        int u = d_v[tid];
        int w = d_e[tid];
        if(d_d[u] == *d_dist){   //of the interest root
            if(d_d[w] == -1){    //not yet visited
                *d_continue = true;    //continue
                d_d[w] = *d_dist + 1;  //increase
            }
        }
    }
}
Below is my attempt at porting it to OpenCL. I'm only an amateur at OpenCL, so I'm trying to port the original code line by line :(
OpenCL host code:
cl_mem d_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*n_count, NULL, NULL);
cl_mem d_dist = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(int), NULL, NULL);
int *h_d;
h_d = (int *)malloc(sizeof(int)*n_count);
cl_mem d_continue = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(bool), NULL, NULL);
float *h_cc;
h_cc = (float *)malloc(sizeof(float)*n_count);

cl_mem d_v = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL, NULL);
cl_mem d_e = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, e_count * sizeof(int), host_v, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_e, CL_TRUE, 0, e_count * sizeof(int), host_e, 0, NULL, NULL);

size_t global_size = e_count;

for(int s=0; s<n_count; s++){ //BIG LOOP
    //initialize h_d[i]
    for(int i=0; i<n_count; i++)
        h_d[i] = -1;
    h_d[s] = 0;

    //copy h_d to d_d
    err = clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
                               n_count * sizeof(int), h_d, 0, NULL, NULL);

    bool h_continue;
    int h_dist = 0;
    int mark = 0;
    int *h_id;
    h_id = (int *)malloc(sizeof(int)*e_count);
    cl_mem id = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                               sizeof(int)*e_count, NULL, NULL);

    do{
        h_continue = false;
        err = clEnqueueWriteBuffer(queue, d_continue, CL_TRUE, 0,
                                   sizeof(bool), &h_continue, 0, NULL, NULL);

        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v);
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_e);
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_d);
        err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_continue);
        err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_dist);
        err = clSetKernelArg(kernel, 5, sizeof(int), (void *)&e_count);
        err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&id);

        /////EXECUTE
        cl_event sync1;
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                     &global_size, NULL, 0, NULL, &sync1); //wait for this to finish (to synchronize)
        err = clWaitForEvents(1, &sync1);
        clReleaseEvent(sync1);
        ///////////////////

        err = clEnqueueReadBuffer(queue, id, CL_TRUE, 0,
                                  sizeof(int)*e_count, h_id, 0, NULL, NULL);
        printf("e_count = %d error : %d\n", e_count, err); //check error?
        for(int j = 0; j < e_count; j++){
            printf("%d ", h_id[j]);
        }

        h_dist++;
        mark++; //for debug
        err = clEnqueueWriteBuffer(queue, d_dist, CL_TRUE, 0,
                                   sizeof(int), &h_dist, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, d_continue, CL_TRUE, 0,
                                  sizeof(bool), &h_continue, 0, NULL, NULL);
    }while(h_continue);

    err = clEnqueueReadBuffer(queue, d_d, CL_TRUE, 0,
                              n_count*sizeof(int), h_d, 0, NULL, NULL);
and the OpenCL kernel:
__kernel void cc_bfs_kernel(__global int *d_v, __global int *d_e, __global int *d_d,
                            __global bool *d_continue, __global int *d_dist, const int e_count, __global int *id)
{
    int tid = get_global_id(0) - get_global_offset(0);
    //barrier(CLK_GLOBAL_MEM_FENCE);
    for(int i = 0; i < e_count; i++){
        id[i] = i;
    }
    if(tid < e_count){
        id[tid] = tid;
        /* for each edge (u, w) */
        int u = d_v[tid];
        int w = d_e[tid];
        if(d_d[u] == *d_dist){   //of the interest root
            if(d_d[w] == -1){    //not yet visited
                *d_continue = true;    //continue
                d_d[w] = *d_dist + 1;  //increase
            }
        }
    }
}
The code does not give the correct results, so I debugged it by printing some values (tid inside the kernel, and a mark value to count how many times the code goes through the while loop). Sadly, tid gives garbage values, and the code goes through the while loop only once. Can you point out what I am missing here?
I have another question: how can I do something like cudaThreadSynchronize()? In this OpenCL version I associate the clEnqueueNDRangeKernel call with an event and wait for it, but I can't seem to get it to work :(
Thanks a lot.
Answer 0 (score: 1)
First of all, you should make sure that every step succeeds by checking the error codes.
For example, AFAIK, this call is not valid:
clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL, NULL)
because you are asking it to use an already-allocated memory region while you do not supply one: the host_ptr argument is NULL.
Remove this flag, or, if you really want host-side memory, specify CL_MEM_ALLOC_HOST_PTR instead.
Check the API documentation to see how each function reports its status: either through the return value or through a dedicated parameter (usually the last one), as with clCreateBuffer: http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateBuffer.html
With your code it should report a CL_INVALID_HOST_PTR error.
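For instance, a minimal corrected sketch of your buffer creation with proper error checking might look like this (simply dropping CL_MEM_USE_HOST_PTR and using the errcode_ret parameter of clCreateBuffer):

cl_int err;
// No host pointer is supplied, so CL_MEM_USE_HOST_PTR must not be set.
cl_mem d_v = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*e_count, NULL, &err);
if (err != CL_SUCCESS) { printf("clCreateBuffer(d_v) failed: %d\n", err); return err; }
cl_mem d_e = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*e_count, NULL, &err);
if (err != CL_SUCCESS) { printf("clCreateBuffer(d_e) failed: %d\n", err); return err; }
// d_d is both read and written by the kernel, so CL_MEM_READ_WRITE is used in this sketch.
cl_mem d_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*n_count, NULL, &err);
if (err != CL_SUCCESS) { printf("clCreateBuffer(d_d) failed: %d\n", err); return err; }
// ...same pattern for d_dist, d_continue and id...

The same kind of check should be applied to every clSetKernelArg, clEnqueueWriteBuffer, clEnqueueReadBuffer and clEnqueueNDRangeKernel call as well.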