Question

我正在尝试在CUDA上运行并行BFS算法

这是我的内核代码

__global__ void initialize_vertices(int* vertices, int starting_vertex){

    int v = blockDim.x * blockIdx.x + threadIdx.x;
    if( v == starting_vertex){
        vertices[v] = 0;        
    }else{
        vertices[v] = -1;
    } 
}

__global__ void bfs(const Edge* edges, int* vertices, int* current_depth, bool* done){

    int e = blockDim.x * blockIdx.x + threadIdx.x;
    int vfirst = edges[e].first;
    int dfirst = vertices[vfirst];
    int vsecond = edges[e].second;
    int dsecond = vertices[vsecond];

    if((dfirst == *current_depth) && (dsecond == -1)){
        vertices[vsecond] = dfirst +1;
        *current_depth = dfirst+1;
        *done = false;
    }
    if((dsecond == *current_depth) && (dfirst == -1)){
        vertices[vfirst] = dsecond + 1;
        *current_depth = dsecond +1;
        *done = false;
    }
}

这是我对struct Edge的定义

typedef struct Edge
{
    int first;
    int second;

}Edge;

这是调用两个内核的主要方法。

int main(int argc, char** argv){

    const int NUM_VERTICES = 10000;
    const size_t VERTEX_BYTES = NUM_VERTICES * sizeof(int);
    const int NUM_EDGES = 10000;
    const size_t EDGE_BYTES = NUM_EDGES * sizeof(Edge);
    const int STARTING_VERTEX = 25;
    cudaError_t err = cudaSuccess;

    //declare the two arrays on host
    int h_vertices[NUM_VERTICES];
    Edge h_edges[NUM_EDGES];


    //fill up the edges array
    for (int i = 0; i < NUM_EDGES; ++i)   
    {
        h_edges[i].first = (rand() % (NUM_VERTICES+1));
        h_edges[i].second = (rand() % (NUM_VERTICES+1));
    }

    //define the two arrays on the device
    Edge* d_edges;
    int* d_vertices;

    //Allocate memory on device for both arrays
    err = cudaMalloc((void**)&d_edges, EDGE_BYTES);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate edges array on device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMalloc((void**)&d_vertices, VERTEX_BYTES);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate vertices array on device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }


    err = cudaMemcpy(d_edges, h_edges, EDGE_BYTES, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy edges array from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMemcpy(d_vertices, h_vertices, VERTEX_BYTES, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vertices array from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    //assign thread configuration
    int threadsPerBlock = 512;
    int blocksPerGrid =(NUM_VERTICES + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    initialize_vertices<<<blocksPerGrid, threadsPerBlock>>>(d_vertices, STARTING_VERTEX);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch initialization kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    printf("Initialization completed\n");
    bool h_done = true;
    bool* d_done;

    int h_current_depth = 0;
    int* d_current_depth;

    err = cudaMalloc((void**)&d_done, sizeof(bool));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocte d_done(error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    err = cudaMalloc((void**)&d_current_depth, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate d_current_depth(error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    //printf("Reached here\n");

    while(!h_done){
        printf("Entered while loop\n");
        err = cudaMemcpy(d_done, &h_done, sizeof(bool), cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to copy h_done to device(error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        err = cudaMemcpy(d_current_depth, &h_current_depth, sizeof(int), cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to launch copy h_current_depth to kernel(error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

        bfs<<<blocksPerGrid, threadsPerBlock>>>(h_edges, h_vertices, d_current_depth, d_done);
        //cudaThreadsSynchronize();
        err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to launch bfs kernel (error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        err = cudaMemcpy(&h_done, d_done, sizeof(bool), cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to copy d_done to host (error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

        err = cudaMemcpy(&h_current_depth, d_current_depth, sizeof(int), cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "Failed to copy d_current_depth to host (error code %s)!\n", cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }

    }
    //printf("Breadth first traversal completed over %d levels\n", h_current_depth);
    cudaFree(d_edges);
    cudaFree(d_vertices);
    //cudaFree(d_done);
    //cudaFree(d_current_depth);
    err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");

    return 0;

}

代码编译很好，但不知何故，执行的线程永远不会进入while循环来启动第二个内核。我是CUDA的新手，有人可以帮我解决这个问题。

Answer 1

问题非常简单，与CUDA完全无关。您可能忽略了这个问题，或者不熟悉while循环的概念。剥离代码版本：

bool h_done = true;
// ...
while(!h_done){
    // ...
}

因为h_done是true，所以您真的不能指望输入while循环。 !h_done == false，即while循环的条件始终为false。如果你使用调试器，很快就会发现这个微不足道的问题。

但是，您的代码中可能还有很多其他问题，但我不能告诉您，因为您没有提供一个有效的示例，例如：类Edge的定义。

循环时CUDA代码无法进入

1 个答案: