使用CUDA表面(surface)时纹理缓存命中率为0%

时间:2015-07-18 12:28:48

标签: caching memory cuda textures surface

大家好!

根据我对CUDA编程指南的理解,无论是通过纹理(texture)还是通过表面(surface)访问,读取操作都会经过纹理缓存。但是,当我在Nsight中运行Memory实验时,使用表面对象的内核显示纹理缓存命中率为0%,而同一个内核改用纹理读取时,命中率可以达到99%。

该内核实现了一个小型的Census变换(census transform),访问的像素如下:

x x r x x

x x r x x

r x o x r

x x r x x

x x r x x

其中,"r"表示被读取的像素,它们位于锚点像素"o"的周围。

下面是内核及其主机端封装函数(stub):

/**
 * Computes a 6-bit census signature per pixel and stores it in `output`.
 *
 * Each in-range thread reads its own pixel from `input` plus six neighbours
 * (two above, two below, and the pixels two columns to the left and right),
 * compares every neighbour against the centre value and packs the
 * "neighbour > centre" results into bits 5..0 of one byte.
 *
 * Neighbour reads use cudaBoundaryModeZero, so coordinates outside the
 * surface yield 0. The centre read and the final write use
 * cudaBoundaryModeTrap.
 *
 * @param input   surface object holding the 8-bit source image
 * @param output  surface object receiving the 8-bit census signatures
 * @param width   number of columns processed
 * @param height  number of rows processed
 *
 * NOTE: surface x coordinates are byte offsets; with 1-byte elements the
 * byte offset is the same number as the pixel column.
 */
__global__ void census_transform_kernel(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height){
    // 16 x 16 threads per block; one thread handles one pixel.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads that fall outside the image have nothing to do.
    if (y >= height || x >= width){
        return;
    }

    const unsigned char center = surf2Dread<unsigned char>(input, x, y, cudaBoundaryModeTrap);

    // Fetch the six neighbours first, then build the signature from them.
    const unsigned char up2    = surf2Dread<unsigned char>(input, x,     y - 2, cudaBoundaryModeZero);
    const unsigned char up1    = surf2Dread<unsigned char>(input, x,     y - 1, cudaBoundaryModeZero);
    const unsigned char left2  = surf2Dread<unsigned char>(input, x - 2, y,     cudaBoundaryModeZero);
    const unsigned char right2 = surf2Dread<unsigned char>(input, x + 2, y,     cudaBoundaryModeZero);
    const unsigned char down1  = surf2Dread<unsigned char>(input, x,     y + 1, cudaBoundaryModeZero);
    const unsigned char down2  = surf2Dread<unsigned char>(input, x,     y + 2, cudaBoundaryModeZero);

    // Bit layout (MSB -> LSB): up2, up1, left2, right2, down1, down2.
    const unsigned char signature =
        ((up2    > center) << 5) +
        ((up1    > center) << 4) +
        ((left2  > center) << 3) +
        ((right2 > center) << 2) +
        ((down1  > center) << 1) +
        ((down2  > center));

    surf2Dwrite(signature, output, x, y, cudaBoundaryModeTrap);
}

/**
 * Host wrapper that launches census_transform_kernel over the whole image
 * and blocks until the kernel has finished.
 *
 * @param input   surface object holding the 8-bit source image
 * @param output  surface object receiving the census signatures
 * @param width   image width in pixels
 * @param height  image height in pixels
 *
 * Uses 16 x 16 thread blocks. DIVIDE_UP is defined outside this snippet and
 * is presumably a ceiling division, so the grid covers images whose
 * dimensions are not multiples of 16 (the kernel bounds-checks the tail).
 */
void census_transform(cudaSurfaceObject_t input, cudaSurfaceObject_t output, int width, int height){
    dim3 threads(16, 16);
    // The grid's y extent must come from the image height; the original used
    // width for both axes, which mis-sized the grid for non-square images.
    dim3 blocks(DIVIDE_UP(width, threads.x), DIVIDE_UP(height, threads.y));

    census_transform_kernel<<<blocks, threads>>>(input, output, width, height);
    // A launch does not return an error itself; query it explicitly so an
    // invalid configuration is reported here rather than at a later call.
    SAFE_CALL(cudaGetLastError(), "Census transform launch failed.");
    SAFE_CALL(cudaDeviceSynchronize(), "Census transform failed.");
}


/**
 * Loads "im.png" as an 8-bit grayscale image, uploads it to a CUDA array
 * bound to a surface object, runs the census transform into a second
 * surface-backed array, and releases all GPU resources.
 *
 * Returns 0 on success and 1 if the image could not be loaded. CUDA errors
 * are reported through SAFE_CALL (defined outside this snippet).
 * The census result is left on the device; this function does not copy it
 * back to the host.
 */
int main(){
    cv::Mat left_im = cv::imread("im.png", cv::IMREAD_GRAYSCALE);
    if (left_im.empty()){
        // imread returns an empty matrix when the file is missing or unreadable.
        return 1;
    }

    // Take the image extent from the loaded matrix instead of relying on
    // width/height variables that this function never declared.
    const int img_width = left_im.cols;
    const int img_height = left_im.rows;

    cudaArray *d_left_array = NULL;
    cudaArray *d_left_census_array = NULL;

    // cudaMallocArray expects the extent in elements (texels), not in bytes.
    cudaChannelFormatDesc left_array_channel_desc = cudaCreateChannelDesc<unsigned char>();
    SAFE_CALL(cudaMallocArray(&d_left_array, &left_array_channel_desc, img_width, img_height, cudaArraySurfaceLoadStore),
              "Allocating input array failed.");

    cudaChannelFormatDesc left_census_array_channel_desc = cudaCreateChannelDesc<unsigned char>();
    SAFE_CALL(cudaMallocArray(&d_left_census_array, &left_census_array_channel_desc, img_width, img_height, cudaArraySurfaceLoadStore),
              "Allocating census array failed.");

    // Resource descriptors must be zeroed before the relevant fields are set.
    cudaResourceDesc left_array_resc;
    memset(&left_array_resc, 0, sizeof(left_array_resc));
    left_array_resc.resType = cudaResourceTypeArray;
    left_array_resc.res.array.array = d_left_array;

    cudaResourceDesc left_census_array_resc;
    memset(&left_census_array_resc, 0, sizeof(left_census_array_resc));
    left_census_array_resc.resType = cudaResourceTypeArray;
    left_census_array_resc.res.array.array = d_left_census_array;

    cudaSurfaceObject_t left_array_surf = 0;
    SAFE_CALL(cudaCreateSurfaceObject(&left_array_surf, &left_array_resc),
              "Creating input surface object failed.");
    cudaSurfaceObject_t left_census_array_surf = 0;
    SAFE_CALL(cudaCreateSurfaceObject(&left_census_array_surf, &left_census_array_resc),
              "Creating census surface object failed.");

    // 2D copy that honours the host row stride (left_im.step), so it is
    // correct even when the cv::Mat rows are padded or non-contiguous.
    SAFE_CALL(cudaMemcpy2DToArray(d_left_array, 0, 0, left_im.data, left_im.step,
                                  img_width * sizeof(unsigned char), img_height, cudaMemcpyHostToDevice),
              "Uploading input image failed.");

    census_transform(left_array_surf, left_census_array_surf, img_width, img_height);

    // Release surface objects before the arrays they refer to.
    SAFE_CALL(cudaDestroySurfaceObject(left_census_array_surf), "Destroying census surface object failed.");
    SAFE_CALL(cudaDestroySurfaceObject(left_array_surf), "Destroying input surface object failed.");
    SAFE_CALL(cudaFreeArray(d_left_census_array), "Freeing census array failed.");
    SAFE_CALL(cudaFreeArray(d_left_array), "Freeing input array failed.");

    return 0;
}

0 个答案:

没有答案