我正在进行一项任务,要求使用CUDA并行化来优化this C程序。
这是我设法提出的:
//...
__global__ void gpu_score_function(void *gpu_frame_pixels, void *gpu_pattern_pixels, void *gpu_results,
int frame_rowstride, int pattern_rowstride,
int pattern_width, int pattern_height,
int frame_width, int frame_height) {
if ((blockIdx.y * blockDim.y + threadIdx.y < frame_height - pattern_height) &&
(blockIdx.x * blockDim.x + threadIdx.x < frame_width - pattern_width)) {
guchar *frame_pixels = (guchar *) gpu_frame_pixels +
(blockIdx.y * blockDim.y + threadIdx.y) * frame_rowstride +
(blockIdx.x * blockDim.x + threadIdx.x) * N_CHANNELS;
guchar *pattern_pixels = (guchar *) gpu_pattern_pixels;
int *results = (int *) gpu_results;
int res = 0;
for (int y = 0; y < pattern_height; ++y) {
if (blockIdx.y * blockDim.y + threadIdx.y + y < frame_height - pattern_height) {
for (int x = 0; x < pattern_width; ++x) {
if (blockIdx.x * blockDim.x + threadIdx.x + x < frame_width - pattern_width) {
const guchar *frame_pixel = frame_pixels + x * N_CHANNELS;
const guchar *pattern_pixel = pattern_pixels + x * N_CHANNELS;
for (int c = 0; c < N_CHANNELS; ++c) {
res += (frame_pixel[c] - 128) * (pattern_pixel[c] - 128);
}
} else {
break;
}
}
frame_pixels += frame_rowstride;
pattern_pixels += pattern_rowstride;
} else {
break;
}
}
results[(blockIdx.y * blockDim.y + threadIdx.y) * (frame_width - pattern_width) + blockIdx.x * blockDim.x + threadIdx.x] = res;
}
}
int main(int argc, const char *argv[]) {
//...
/**
* CUDA
*/
void *gpu_pattern_pixels;
void *gpu_frame_pixels;
void *gpu_results;
cudaMalloc(&gpu_pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar));
cudaMalloc(&gpu_frame_pixels, frame_height * frame_rowstride * sizeof(guchar));
cudaMalloc(&gpu_results, (frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy(gpu_pattern_pixels, (void *) pattern_pixels, pattern_height * pattern_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_frame_pixels, (void *) frame_pixels, frame_height * frame_rowstride * sizeof(guchar),
cudaMemcpyHostToDevice);
//Kernel configuration, where a two-dimensional grid and
//three-dimensional blocks are configured.
dim3 dimGrid(ceil((float) (frame_width - pattern_width) / 32), ceil((float) (frame_height - pattern_height) / 32));
dim3 dimBlock(32, 32);
gpu_score_function<<<dimGrid, dimBlock>>>(gpu_frame_pixels, gpu_pattern_pixels, gpu_results, frame_rowstride, pattern_rowstride, pattern_width, pattern_height, frame_width, frame_height);
cudaDeviceSynchronize();
int *results = (int *) malloc((frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int));
cudaMemcpy((void *) results, gpu_results,
(frame_width - pattern_width) * (frame_height - pattern_height) * sizeof(int), cudaMemcpyDeviceToHost);
int gpu_x_best, gpu_y_best;
double gpu_best_score;
for (int *cur = results; cur != results + (frame_width - pattern_width) * (frame_height - pattern_height); cur++) {
if (cur == results || *cur > gpu_best_score) {
gpu_best_score = *cur;
gpu_x_best = (cur - results) % (frame_width - pattern_width);
gpu_y_best = (cur - results) / (frame_width - pattern_width);
}
}
cudaFree(gpu_pattern_pixels);
cudaFree(gpu_frame_pixels);
cudaFree(gpu_results);
free(results);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
/**
* END CUDA
*/
//...
return 0;
}
程序没有段错误,cuda-memcheck给出0错误,结果矩阵被填充。 问题是,结果是错误的。
我很确定这是一个一个一个指针错误,但我不知道如何发现它。
我正在研究OSX 10.9,我可以使用哪些工具来调试这个程序?
感谢任何帮助。
答案 0 :(得分:-2)
我发现了这个错误。
gpu_score_function的for循环中的两个if语句没有意义。删除它们解决了这个问题。