我正在尝试在CUDA上运行并行BFS算法
这是我的内核代码
__global__ void initialize_vertices(int* vertices, int starting_vertex){
int v = blockDim.x * blockIdx.x + threadIdx.x;
if( v == starting_vertex){
vertices[v] = 0;
}else{
vertices[v] = -1;
}
}
__global__ void bfs(const Edge* edges, int* vertices, int* current_depth, bool* done){
int e = blockDim.x * blockIdx.x + threadIdx.x;
int vfirst = edges[e].first;
int dfirst = vertices[vfirst];
int vsecond = edges[e].second;
int dsecond = vertices[vsecond];
if((dfirst == *current_depth) && (dsecond == -1)){
vertices[vsecond] = dfirst +1;
*current_depth = dfirst+1;
*done = false;
}
if((dsecond == *current_depth) && (dfirst == -1)){
vertices[vfirst] = dsecond + 1;
*current_depth = dsecond +1;
*done = false;
}
}
这是我对struct Edge的定义
typedef struct Edge
{
int first;
int second;
}Edge;
这是调用两个内核的主要方法。
int main(int argc, char** argv){
const int NUM_VERTICES = 10000;
const size_t VERTEX_BYTES = NUM_VERTICES * sizeof(int);
const int NUM_EDGES = 10000;
const size_t EDGE_BYTES = NUM_EDGES * sizeof(Edge);
const int STARTING_VERTEX = 25;
cudaError_t err = cudaSuccess;
//declare the two arrays on host
int h_vertices[NUM_VERTICES];
Edge h_edges[NUM_EDGES];
//fill up the edges array
for (int i = 0; i < NUM_EDGES; ++i)
{
h_edges[i].first = (rand() % (NUM_VERTICES+1));
h_edges[i].second = (rand() % (NUM_VERTICES+1));
}
//define the two arrays on the device
Edge* d_edges;
int* d_vertices;
//Allocate memory on device for both arrays
err = cudaMalloc((void**)&d_edges, EDGE_BYTES);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate edges array on device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMalloc((void**)&d_vertices, VERTEX_BYTES);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate vertices array on device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_edges, h_edges, EDGE_BYTES, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy edges array from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_vertices, h_vertices, VERTEX_BYTES, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vertices array from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//assign thread configuration
int threadsPerBlock = 512;
int blocksPerGrid =(NUM_VERTICES + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
initialize_vertices<<<blocksPerGrid, threadsPerBlock>>>(d_vertices, STARTING_VERTEX);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch initialization kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Initialization completed\n");
bool h_done = true;
bool* d_done;
int h_current_depth = 0;
int* d_current_depth;
err = cudaMalloc((void**)&d_done, sizeof(bool));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocte d_done(error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMalloc((void**)&d_current_depth, sizeof(int));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate d_current_depth(error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//printf("Reached here\n");
while(!h_done){
printf("Entered while loop\n");
err = cudaMemcpy(d_done, &h_done, sizeof(bool), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy h_done to device(error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_current_depth, &h_current_depth, sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch copy h_current_depth to kernel(error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
bfs<<<blocksPerGrid, threadsPerBlock>>>(h_edges, h_vertices, d_current_depth, d_done);
//cudaThreadsSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch bfs kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy d_done to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(&h_current_depth, d_current_depth, sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy d_current_depth to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//printf("Breadth first traversal completed over %d levels\n", h_current_depth);
cudaFree(d_edges);
cudaFree(d_vertices);
//cudaFree(d_done);
//cudaFree(d_current_depth);
err = cudaDeviceReset();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
printf("Done\n");
return 0;
}
代码编译很好,但不知何故,执行的线程永远不会进入while循环来启动第二个内核。 我是CUDA的新手,有人可以帮我解决这个问题。
答案 0 :(得分:2)
问题非常简单,与CUDA完全无关。您可能忽略了这个问题,或者不熟悉while
循环的概念。剥离代码版本:
bool h_done = true;
// ...
while(!h_done){
// ...
}
因为h_done
是true
,所以您真的不能指望输入while
循环。 !h_done == false
,即while
循环的条件始终为false
。如果你使用调试器,很快就会发现这个微不足道的问题。
但是,您的代码中可能还有很多其他问题,但我不能告诉您,因为您没有提供一个有效的示例,例如:类Edge
的定义。