Question

我正在进行一项任务，要求使用CUDA并行化来优化this C程序。

这是我设法提出的：

//...

__global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,
                                   int frame_rowstride, int pattern_rowstride,
                                   int pattern_width, int pattern_height,
                                   int frame_width, int frame_height) {
    if ((blockIdx.y * blockDim.y + threadIdx.y < frame_height - pattern_height) &&
        (blockIdx.x * blockDim.x + threadIdx.x < frame_width - pattern_width)) {
        guchar *frame_pixels = (guchar *) gpu_frame_pixels +
                               (blockIdx.y * blockDim.y + threadIdx.y) * frame_rowstride +
                               (blockIdx.x * blockDim.x + threadIdx.x) * N_CHANNELS;
        guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;
        int *results = (int *) gpu_results;

        int res = 0;
        for (int y = 0; y < pattern_height; ++y) {
            if (blockIdx.y * blockDim.y + threadIdx.y + y < frame_height - pattern_height) {
                for (int x = 0; x < pattern_width; ++x) {
                    if (blockIdx.x * blockDim.x + threadIdx.x + x < frame_width - pattern_width) {
                        const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
                        const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
                        for (int c = 0; c < N_CHANNELS; ++c) {
                            res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
                        }
                    } else {
                        break;
                    }
                }
                frame_pixels += frame_rowstride;
                pattern_pixels += pattern_rowstride;
            } else {
                break;
            }
        }

        results[(blockIdx.y * blockDim.y + threadIdx.y) * (frame_width - pattern_width) + blockIdx.x * blockDim.x + threadIdx.x] = res;
    }
}

int main(int argc, const char *argv[]) {
    //...

    /**
     * CUDA
     */
    void *gpu_pattern_pixels;
    void *gpu_frame_pixels;
    void *gpu_results;

    cudaMalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));
    cudaMalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));
    cudaMalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));

    cudaMemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),
               cudaMemcpyHostToDevice);

    //Kernel configuration, where a two-dimensional grid and
    //three-dimensional blocks are configured.
    dim3 dimGrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));
    dim3 dimBlock(32, 32);
    gpu_score_function<<<dimGrid, dimBlock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);
    cudaDeviceSynchronize();

    int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
    cudaMemcpy((void *) results, gpu_results,
               (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudaMemcpyDeviceToHost);

    int gpu_x_best, gpu_y_best;
    double gpu_best_score;

    for (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {
        if (cur == results || *cur > gpu_best_score) {
            gpu_best_score = *cur;
            gpu_x_best = (cur - results) % (frame_width - pattern_width);
            gpu_y_best = (cur - results) / (frame_width - pattern_width);
        }
    }

    cudaFree(gpu_pattern_pixels);
    cudaFree(gpu_frame_pixels);
    cudaFree(gpu_results);
    free(results);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    /**
     * END CUDA
     */

    //...

    return 0;
}

程序没有段错误，cuda-memcheck给出0错误，结果矩阵被填充。问题是，结果是错误的。

我很确定这是一个一个一个指针错误，但我不知道如何发现它。

我正在研究OSX 10.9，我可以使用哪些工具来调试这个程序？

感谢任何帮助。

Answer 1

我发现了这个错误。

gpu_score_function的for循环中的两个if语句没有意义。删除它们解决了这个问题。

（家庭作业）将函数转换为CUDA内核函数

1 个答案: