美好的一天!
根据我对 CUDA 编程指南的理解,无论是通过纹理(texture)还是表面(surface)访问,读取都会经过纹理缓存。但是,当我在 Nsight 中运行内存(Memory)实验时,使用表面读取的内核显示纹理缓存命中率为 0%,而同一内核改用纹理读取时,命中率却达到 99%。
该内核用于计算一个小型的 Census 变换,访问的像素如下:
x x r x x
x x r x x
r x o x r
x x r x x
x x r x x
其中 "o" 表示锚点像素,"r" 表示在锚点像素周围被读取的像素。
以下是内核以及主机端的调用代码(stub):
// Computes a sparse 6-bit census signature for every pixel of `input` and
// stores it in `output`.
//
// Each thread handles one pixel (x, y). The centre pixel is compared with six
// neighbours; a neighbour that is strictly greater than the centre sets its
// bit in the result:
//   bit 5: (x, y - 2)   bit 4: (x, y - 1)
//   bit 3: (x - 2, y)   bit 2: (x + 2, y)
//   bit 1: (x, y + 1)   bit 0: (x, y + 2)
//
// Launch layout: 2D grid of 2D blocks (the host wrapper uses 16 x 16 threads)
// covering at least width x height threads; surplus threads exit immediately.
// No shared memory is used.
//
// Note: the x coordinate of surf2Dread/surf2Dwrite is a byte offset. It equals
// the pixel index here only because each texel is a single unsigned char.
__global__ void census_transform_kernel(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the image have nothing to do.
    if (y >= height || x >= width){
        return;
    }

    // The anchor pixel is always in range, so an out-of-range access here
    // would be a genuine error: trap on it.
    const unsigned char center = surf2Dread<unsigned char>(input, x, y, cudaBoundaryModeTrap);

    // Neighbours may fall outside the image near the borders;
    // cudaBoundaryModeZero makes such reads return 0.
    const unsigned char up2    = surf2Dread<unsigned char>(input, x, y - 2, cudaBoundaryModeZero);
    const unsigned char up1    = surf2Dread<unsigned char>(input, x, y - 1, cudaBoundaryModeZero);
    const unsigned char left2  = surf2Dread<unsigned char>(input, x - 2, y, cudaBoundaryModeZero);
    const unsigned char right2 = surf2Dread<unsigned char>(input, x + 2, y, cudaBoundaryModeZero);
    const unsigned char down1  = surf2Dread<unsigned char>(input, x, y + 1, cudaBoundaryModeZero);
    const unsigned char down2  = surf2Dread<unsigned char>(input, x, y + 2, cudaBoundaryModeZero);

    // Pack the six comparison results; the bits never overlap.
    const unsigned char signature =
        ((up2    > center) << 5) |
        ((up1    > center) << 4) |
        ((left2  > center) << 3) |
        ((right2 > center) << 2) |
        ((down1  > center) << 1) |
        ((down2  > center));

    surf2Dwrite(signature, output, x, y, cudaBoundaryModeTrap);
}
// Host-side launcher for census_transform_kernel.
//
// Parameters:
//   input  - surface object to read the source image from.
//   output - surface object that receives the census signatures.
//   width  - image width in pixels.
//   height - image height in pixels.
//
// Launches one thread per pixel in 16 x 16 blocks on the default stream and
// blocks until the kernel has finished. Errors are reported through the
// file's SAFE_CALL macro.
void census_transform(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height){
    // An empty image would produce a zero-sized grid, which is an invalid
    // launch configuration; there is nothing to compute anyway.
    if (width <= 0 || height <= 0){
        return;
    }

    dim3 threads(16, 16);
    // Fix: the grid's y extent must be derived from the image height. The
    // previous code used width for both dimensions, so images taller than
    // wide were only partially processed.
    dim3 blocks(DIVIDE_UP(width, threads.x), DIVIDE_UP(height, threads.y));

    census_transform_kernel<<<blocks, threads>>>(input, output, width, height);
    // Launch-configuration errors are only visible through cudaGetLastError().
    SAFE_CALL(cudaGetLastError(), "Census transform launch failed.");
    // Faults raised while the kernel runs surface at the next synchronisation.
    SAFE_CALL(cudaDeviceSynchronize(), "Census transform failed.");
}
// Stub driver: loads "im.png" as an 8-bit grayscale image, uploads it to a
// CUDA array, runs the census transform into a second CUDA array through
// surface objects, and releases the GPU resources.
//
// Returns 0 on success and 1 if the image could not be loaded. CUDA failures
// are reported through the file's SAFE_CALL macro.
int main(){
    cv::Mat left_im = cv::imread("im.png", cv::IMREAD_GRAYSCALE);
    if (left_im.empty()){
        // imread yields an empty Mat when the file is missing or unreadable.
        return 1;
    }

    // Fix: width/height were used without ever being declared.
    const int width = left_im.cols;
    const int height = left_im.rows;

    // Fix: the census output array was never declared (an unused
    // d_right_array was declared instead).
    cudaArray *d_left_array = nullptr;
    cudaArray *d_left_census_array = nullptr;

    // cudaMallocArray takes the width in elements, not bytes.
    cudaChannelFormatDesc left_array_channel_desc = cudaCreateChannelDesc<unsigned char>();
    SAFE_CALL(cudaMallocArray(&d_left_array, &left_array_channel_desc, width, height, cudaArraySurfaceLoadStore),
              "Failed to allocate input array.");

    cudaChannelFormatDesc left_census_array_channel_desc = cudaCreateChannelDesc<unsigned char>();
    SAFE_CALL(cudaMallocArray(&d_left_census_array, &left_census_array_channel_desc, width, height, cudaArraySurfaceLoadStore),
              "Failed to allocate census array.");

    // Describe both arrays as resources so surface objects can be bound to them.
    cudaResourceDesc left_array_resc;
    memset(&left_array_resc, 0, sizeof(left_array_resc));
    left_array_resc.resType = cudaResourceTypeArray;
    left_array_resc.res.array.array = d_left_array;

    cudaResourceDesc left_census_array_resc;
    memset(&left_census_array_resc, 0, sizeof(left_census_array_resc));
    left_census_array_resc.resType = cudaResourceTypeArray;
    left_census_array_resc.res.array.array = d_left_census_array;

    cudaSurfaceObject_t left_array_tex = 0;
    SAFE_CALL(cudaCreateSurfaceObject(&left_array_tex, &left_array_resc),
              "Failed to create input surface object.");

    // Fix: the type and the variable name were fused together, which did not compile.
    cudaSurfaceObject_t left_census_array_tex = 0;
    SAFE_CALL(cudaCreateSurfaceObject(&left_census_array_tex, &left_census_array_resc),
              "Failed to create census surface object.");

    // Upload the image. cudaMemcpyToArray is deprecated; the 2D variant also
    // honours the Mat's row stride, so padded (non-continuous) images work.
    SAFE_CALL(cudaMemcpy2DToArray(d_left_array, 0, 0, left_im.data,
                                  static_cast<size_t>(left_im.step),
                                  width * sizeof(unsigned char), height,
                                  cudaMemcpyHostToDevice),
              "Failed to upload input image.");

    census_transform(left_array_tex, left_census_array_tex, width, height);

    // Release the surface objects before the arrays that back them.
    SAFE_CALL(cudaDestroySurfaceObject(left_array_tex), "Failed to destroy input surface object.");
    SAFE_CALL(cudaDestroySurfaceObject(left_census_array_tex), "Failed to destroy census surface object.");
    SAFE_CALL(cudaFreeArray(d_left_array), "Failed to free input array.");
    SAFE_CALL(cudaFreeArray(d_left_census_array), "Failed to free census array.");
    return 0;
}