2D CUDA median filter optimization

Date: 2013-10-28 12:04:12

Tags: c++ performance optimization cuda median

I have implemented a 2D median filter in CUDA; the complete program is shown below.

#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <iostream>  
#include <fstream>   
#include <iomanip>   
#include <windows.h>
#include <io.h>                  
#include <stdio.h>
#include <conio.h>
#include <cstdlib>
#include <process.h>
#include <malloc.h>
#include <ctime>
using namespace std;

#define MEDIAN_DIMENSION  3 // Window of 3 x 3. Can also use 5 x 5, 7 x 7, 9 x 9, ...
#define MEDIAN_LENGTH 9   // Should be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3

#define BLOCK_WIDTH 16  // Should be 8 if the window is larger than 5 x 5, otherwise the error "uses too much shared data" occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
#define BLOCK_HEIGHT 16 // Should be 8 if the window is larger than 5 x 5, otherwise the error "uses too much shared data" occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]

 __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){

      __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH];

    int iterator;
    const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1;
    int StartPoint=MEDIAN_DIMENSION/2;
    int EndPoint=StartPoint+1;

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    const int tid = threadIdx.y * blockDim.x + threadIdx.x;   // linear thread index within the block (row length is blockDim.x)

      if(x>=Image_Width || y>=Image_Height)
        return;

     //Fill surround with the pixel values of the image in a MEDIAN_DIMENSION x MEDIAN_DIMENSION window
            if (x == 0 || x == Image_Width - StartPoint || y == 0
                || y == Image_Height - StartPoint) {
            } else {             
                iterator = 0;
                for (int r = x - StartPoint; r < x + (EndPoint); r++) {
                    for (int c = y - StartPoint; c < y + (EndPoint); c++) {
                        surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r);
                        iterator++;
                    }
                }
//Sort the surround array to find the median. Use bubble/selection sort for a 3 x 3 window.
                    //You can use the insertion sort commented out below for larger windows.

                              ////      bubble sort //

                    for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i)
                    {     
                        // Find position of minimum element
                        int min=i;
                        for ( int l=i+1; l<MEDIAN_LENGTH; ++l)
                            if (surround[tid][l] <surround[tid][min] )
                                min=l;
                        // Put found minimum element in its place
                        unsigned short  temp= surround[tid][i];
                        surround[tid][i]=surround[tid][min];
                        surround[tid][min]=temp;
                    }//bubble sort end

                    //////insertion sort start   //

                    /*int t,j,i;
                    for ( i = 1 ; i< MEDIAN_LENGTH ; i++) {
                        j = i;
                        while ( j > 0 && surround[tid][j] < surround[tid][j-1]) {
                            t= surround[tid][j];
                            surround[tid][j]= surround[tid][j-1];
                            surround[tid][j-1] = t;
                            j--;
                        }
                    }*/

                    ////insertion sort end   



                    *(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1];   // gives surround[tid][4] as the median value for a 3 x 3 window
                        __syncthreads();
            }  
}

  int main( int argc, const char** argv )
{
    int dataLength;
    int p1;
    unsigned short* Host_ImageData = NULL;
    ifstream is; // Read File 
    is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary );

    // get length of file:
    is.seekg (0, ios::end);
    dataLength = is.tellg();
    is.seekg (0, ios::beg);

    Host_ImageData = new  unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read ((char*)Host_ImageData,dataLength);
    is.close();

    int Image_Width = 1580;
    int Image_Height = 1050;

    unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength);
    unsigned short *Device_ImageData = NULL;

    /////////////////////////////
    // The first cudaMalloc takes more time for memory allocation, and I don't want to include that time in my measurement.
    // So please ignore the code for displaying the first cudaMalloc time.
    clock_t begin = clock();
    unsigned short *forFirstCudaMalloc = NULL;
    cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) );
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"First CudaMelloc time = "<<elapsed_secs<<"  Second\n" ;
    cudaFree( forFirstCudaMalloc );
    ////////////////////////////

    //Actual Process Starts From Here 
    clock_t beginOverAll = clock();   //
    cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) ); 
    cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering

    int x = static_cast<int>(ceilf(static_cast<float>(Image_Width) / BLOCK_WIDTH));
    int y = static_cast<int>(ceilf(static_cast<float>(Image_Height) / BLOCK_HEIGHT));

    const dim3 grid (x, y, 1);      
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    begin = clock();

    MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height);
    cudaDeviceSynchronize();

    end = clock();
    elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"Process time = "<<elapsed_secs<<"  Second\n" ;

    cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copying Back Device Data To Host Memory To write In file After Filter Done

    clock_t endOverall = clock();
    elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC;
    cout<<"Complete Time  = "<<elapsed_secs<<"  Second\n" ;

    ofstream of2;   //Write Filtered Image Into File
    of2.open("D:\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Host_ResultData,dataLength);
    of2.close();
    cout<<"\nEnd of Writing File.  Press Any Key To Exit..!!";
    cudaFree(Device_ImageData);
    delete[] Host_ImageData;   // allocated with new[]
    free(Host_ResultData);     // allocated with malloc

    getch();
    return 0;
}

Here is the link to the file I used. I stored the image in "raw" format with ImageJ and read the "raw" image back the same way. My image pixels are 16-bit unsigned short. The image width is 1580 and the height is 1050.

I strongly believe that the filtering can be made more efficient with proper CUDA optimizations.

I am running on a GeForce GT 520M card, and the timings are as follows:

1) MEDIAN_DIMENSION of 3 x 3 = 0.027 seconds

2) MEDIAN_DIMENSION of 5 x 5 = 0.206 seconds

3) MEDIAN_DIMENSION of 7 x 7 = 1.11 seconds

4) MEDIAN_DIMENSION of 9 x 9 = 4.931 seconds

As you can see, the time increases a lot as MEDIAN_DIMENSION grows, and my applications generally use the higher MEDIAN_DIMENSION values such as 7 x 7 and 9 x 9. I think that, by using CUDA, even for 9 x 9 the time should be below 1 second.

Since I think the sorting part takes most of the time here, can we make the sorting part of the algorithm faster?
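For a fixed 3 x 3 window, the general sort can in principle be replaced by a fixed sequence of min/max exchanges. Below is only a minimal sketch of that idea (an assumption, not part of the program above): it uses the widely known "opt_med9" exchange network, it assumes the 9 window values already sit in a per-thread register array v, and the helper names cswap and median9 are illustrative.

// Minimal sketch (assumption): branch-light median of 9 values via a fixed
// comparison-exchange network instead of a general-purpose sort.
__device__ __forceinline__ void cswap(unsigned short &a, unsigned short &b) {
    unsigned short lo = min(a, b);
    unsigned short hi = max(a, b);
    a = lo; b = hi;          // after this, a <= b
}

__device__ unsigned short median9(unsigned short v[9]) {
    cswap(v[1], v[2]); cswap(v[4], v[5]); cswap(v[7], v[8]);
    cswap(v[0], v[1]); cswap(v[3], v[4]); cswap(v[6], v[7]);
    cswap(v[1], v[2]); cswap(v[4], v[5]); cswap(v[7], v[8]);
    cswap(v[0], v[3]); cswap(v[5], v[8]); cswap(v[4], v[7]);
    cswap(v[3], v[6]); cswap(v[1], v[4]); cswap(v[2], v[5]);
    cswap(v[4], v[7]); cswap(v[4], v[2]); cswap(v[6], v[4]);
    cswap(v[4], v[2]);
    return v[4];             // the median ends up in v[4]
}

Each thread would load its 3 x 3 neighbourhood into v and call median9, so the sort never touches shared memory.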

Can we use the grid and block more efficiently? Can I use a larger BLOCK_WIDTH and BLOCK_HEIGHT (for example 32 and 32) and still stay within the maximum __shared__ memory limit of my device?

Can the __shared__ memory be used more efficiently?
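Regarding the block-size question above, the per-block footprint of surround can be estimated directly. This is only a back-of-the-envelope sketch; the actual limit should be queried from the device rather than assumed.

// Sketch: shared memory used per block by surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
// bytes = BLOCK_WIDTH * BLOCK_HEIGHT * MEDIAN_LENGTH * sizeof(unsigned short)
//   16 x 16 block, 3 x 3 window: 16*16* 9*2 =  4608 bytes
//   16 x 16 block, 9 x 9 window: 16*16*81*2 = 41472 bytes
//   32 x 32 block, 3 x 3 window: 32*32* 9*2 = 18432 bytes (already over a 16 KB limit)
size_t smemPerBlock = (size_t)BLOCK_WIDTH * BLOCK_HEIGHT * MEDIAN_LENGTH * sizeof(unsigned short);

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);   // device 0 assumed
printf("needed per block: %zu bytes, available per block: %zu bytes\n",
       smemPerBlock, prop.sharedMemPerBlock);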

Any help will be appreciated.

Thanks in advance.

4 Answers:

Answer 0 (score: 3)

I am answering your last question, on the use of shared memory.

As Eric already noted, your use of shared memory does not really lead to any thread cooperation.

For the 3x3 case, I compare your solution against a variant of your kernel that does not use shared memory at all, and against the Accelereyes solution discussed in 2D median filtering in CUDA: how to efficiently copy global memory to shared memory.

Here is the full code:

#include <iostream>  
#include <fstream>   
#include <cstdio>

using namespace std;

#define BLOCK_WIDTH 16 
#define BLOCK_HEIGHT 16

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
    const int tx_l = threadIdx.x;                           // --- Local thread x index
    const int ty_l = threadIdx.y;                           // --- Local thread y index

    const int tx_g = blockIdx.x * blockDim.x + tx_l;        // --- Global thread x index
    const int ty_g = blockIdx.y * blockDim.y + ty_l;        // --- Global thread y index

    __shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];

    // --- Fill the shared memory border with zeros
    if (tx_l == 0)                      smem[tx_l]  [ty_l+1]    = 0;    // --- left border
    else if (tx_l == BLOCK_WIDTH-1)     smem[tx_l+2][ty_l+1]    = 0;    // --- right border
    if (ty_l == 0) {                    smem[tx_l+1][ty_l]      = 0;    // --- upper border
        if (tx_l == 0)                  smem[tx_l]  [ty_l]      = 0;    // --- top-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l]      = 0;    // --- top-right corner
        }   else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2]  = 0;    // --- bottom border
        if (tx_l == 0)                  smem[tx_l]  [ty_l+2]    = 0;    // --- bottom-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2]    = 0;    // --- bottom-right corner
    }

    // --- Fill shared memory
                                                                    smem[tx_l+1][ty_l+1] =                           Input_Image[ty_g*Image_Width + tx_g];      // --- center
    if ((tx_l == 0)&&((tx_g > 0)))                                      smem[tx_l]  [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1];      // --- left border
    else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))         smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1];      // --- right border
    if ((ty_l == 0)&&(ty_g > 0)) {                                      smem[tx_l+1][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g];    // --- upper border
            if ((tx_l == 0)&&((tx_g > 0)))                                  smem[tx_l]  [ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g-1];  // --- top-left corner
            else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g+1];  // --- top-right corner
         } else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) {  smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g];    // --- bottom border
         if ((tx_l == 0)&&((tx_g > 0)))                                 smem[tx_l]  [ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g-1];  // --- bottom-left corner (row ty_g+1)
        else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1];  // --- bottom-right corner
    }
    __syncthreads();

    // --- Pull the 3x3 window in a local array
    unsigned short v[9] = { smem[tx_l][ty_l],   smem[tx_l+1][ty_l],     smem[tx_l+2][ty_l],
                            smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1],   smem[tx_l+2][ty_l+1],
                            smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2],   smem[tx_l+2][ty_l+2] };    

    // --- Bubble-sort
    for (int i = 0; i < 5; i++) {
        for (int j = i + 1; j < 9; j++) {
            if (v[i] > v[j]) { // swap?
                unsigned short tmp = v[i];
                v[i] = v[j];
                v[j] = tmp;
            }
         }
    }

    // --- Pick the middle one
    Output_Image[ty_g*Image_Width + tx_g] = v[4];
}

/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];

    int iterator;

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   

    if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

    // --- Fill shared memory
    iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[tid][iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Sort shared memory to find the median using bubble sort
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[tid][i];
        surround[tid][i]=surround[tid][minval];
        surround[tid][minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[tid][4]; 

    __syncthreads();

}

/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    unsigned short surround[9];

    int iterator;

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   

    if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

    // --- Fill array private to the threads
    iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Sort private array to find the median using bubble sort
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[i];
        surround[i]=surround[minval];
        surround[minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[4]; 

}

/********/
/* MAIN */
/********/
int main()
{
    const int Image_Width = 1580;
    const int Image_Height = 1050;

    // --- Open data file
    ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );

    // --- Get file length
    is.seekg(0, ios::end);
    int dataLength = is.tellg();
    is.seekg(0, ios::beg);

    // --- Read data from file and close file
    unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read((char*)Input_Image_Host,dataLength);
    is.close();

    // --- CUDA warm up
    unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
    gpuErrchk(cudaFree(forFirstCudaMalloc));

    // --- Allocate host and device memory spaces 
    unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
    unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short))); 
    unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short))); 

    // --- Copy data from host to device
    gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering

    // --- Grid and block sizes
    const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);      
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    /****************************/
    /* ORIGINAL KERNEL FUNCTION */
    /****************************/
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
    Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Original kernel function - elapsed time:  %3.3f ms \n", time);

    /***********************************************/
    /* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
    /***********************************************/
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
    Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Original kernel function - no shared - elapsed time:  %3.3f ms \n", time);

    /**********************************************/
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
    /**********************************************/
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
    Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Optimized kernel function - shared - elapsed time:  %3.3f ms \n", time);

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));

    // --- Open results file, write results and close the file
    ofstream of2;     of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Output_Image_Host, dataLength);
    of2.close();

    cout << "\n Press Any Key To Exit..!!";
    gpuErrchk(cudaFree(Input_Image));
    gpuErrchk(cudaFree(Output_Image));

    delete[] Input_Image_Host;    // allocated with new[]
    free(Output_Image_Host);      // allocated with malloc

    return 0;
}

Here are the timing results on a Kepler K20c:

1580 x 1050
Original_Kernel_Function             = 1.588ms
Original_Kernel_Function_no_shared   = 1.278ms
Optimized_Kernel_Function_shared     = 1.455ms

2048 x 2048
Original_Kernel_Function             = 3.94ms
Original_Kernel_Function_no_shared   = 3.118ms
Optimized_Kernel_Function_shared     = 3.709ms

4096 x 4096
Original_Kernel_Function             = 16.003ms
Original_Kernel_Function_no_shared   = 13.735ms
Optimized_Kernel_Function_shared     = 14.526ms

8192 x 8192
Original_Kernel_Function             = 62.278ms
Original_Kernel_Function_no_shared   = 47.484ms
Optimized_Kernel_Function_shared     = 57.474ms

Here are the timings on a GT540M, which is closer to your card:

1580 x 1050
Original_Kernel_Function             = 10.332 ms
Original_Kernel_Function_no_shared   =  9.294 ms
Optimized_Kernel_Function_shared     = 10.301 ms

2048 x 2048
Original_Kernel_Function             = 25.256 ms
Original_Kernel_Function_no_shared   = 23.567 ms
Optimized_Kernel_Function_shared     = 23.876 ms

4096 x 4096
Original_Kernel_Function             = 99.791 ms
Original_Kernel_Function_no_shared   = 93.919 ms
Optimized_Kernel_Function_shared     = 95.464 ms

8192 x 8192
Original_Kernel_Function             = 399.259 ms
Original_Kernel_Function_no_shared   = 375.634 ms
Optimized_Kernel_Function_shared     = 383.121 ms

As can be seen, in all cases the version that does not use shared memory appears to be (slightly) more convenient.

Answer 1 (score: 1)

It seems you do not have any actual sharing between threads in your use of shared memory, i.e. for a 3x3 filter each pixel is read from global memory 9 times, which is unnecessary. This whitepaper may give you some ideas on how to use shared memory in a convolution kernel. Hope it helps.

http://docs.nvidia.com/cuda/samples/3_Imaging/convolutionSeparable/doc/convolutionSeparable.pdf

Answer 2 (score: 1)

Quickselect is the fastest linear-time median algorithm in the best case; however, it is hard to implement in CUDA because of its memory overhead. The simplest approach for a highly parallel algorithm is to minimize memory overhead. Instead of a partial-sorting algorithm like quickselect, eliminate the memory overhead entirely by using the Torben median algorithm. The Torben algorithm may be much slower than other methods, but it does not modify the input data, so no shared memory needs to be allocated.
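A minimal per-pixel sketch of the Torben idea (an assumption for illustration, not code from this answer; torben_median is a hypothetical helper): the search range [vmin, vmax] is repeatedly halved by counting how many window pixels fall below/above a guess, and the input is only read, never copied or reordered.

// Sketch (assumption): Torben-style median of the dim x dim window around (x, y).
// Border handling is omitted: the window is assumed to stay inside the image.
__device__ unsigned short torben_median(const unsigned short *img,
                                        int x, int y, int width, int dim)
{
    const int half = dim / 2, n = dim * dim;
    unsigned short vmin = 0xFFFF, vmax = 0;

    // --- Find the value range of the window
    for (int r = -half; r <= half; r++)
        for (int c = -half; c <= half; c++) {
            unsigned short v = img[(y + r) * width + (x + c)];
            vmin = min(vmin, v); vmax = max(vmax, v);
        }

    // --- Narrow [vmin, vmax] by counting elements on each side of a guess
    while (true) {
        unsigned short guess = (unsigned short)((vmin + vmax) / 2);
        int less = 0, greater = 0, equal = 0;
        unsigned short maxltguess = vmin, mingtguess = vmax;
        for (int r = -half; r <= half; r++)
            for (int c = -half; c <= half; c++) {
                unsigned short v = img[(y + r) * width + (x + c)];
                if (v < guess)      { less++;    maxltguess = max(maxltguess, v); }
                else if (v > guess) { greater++; mingtguess = min(mingtguess, v); }
                else                  equal++;
            }
        if (less <= (n + 1) / 2 && greater <= (n + 1) / 2) {
            // --- The median is bracketed: pick the right representative
            if (less >= (n + 1) / 2)              return maxltguess;
            else if (less + equal >= (n + 1) / 2) return guess;
            else                                  return mingtguess;
        }
        else if (less > greater) vmax = maxltguess;
        else                     vmin = mingtguess;
    }
}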

Finally, for maximum speed, bind the input to a texture, which also takes care of border extension for you. To minimize cache misses, use row-major nested for loops over the row-major array.
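A minimal host-side sketch of the texture idea (an assumption for illustration, using the texture-object API, which requires compute capability 3.0; on older cards the legacy texture-reference API plays the same role, and the name texImage is illustrative). Clamp addressing provides the border extension mentioned above.

// Sketch (assumption): expose the 16-bit image as a 2D texture so that
// out-of-range reads are clamped to the border automatically.
cudaChannelFormatDesc chDesc = cudaCreateChannelDesc<unsigned short>();
cudaArray_t cuArray;
cudaMallocArray(&cuArray, &chDesc, Image_Width, Image_Height);
cudaMemcpy2DToArray(cuArray, 0, 0, Host_ImageData,
                    Image_Width * sizeof(unsigned short),   // source pitch in bytes
                    Image_Width * sizeof(unsigned short),   // row width in bytes
                    Image_Height, cudaMemcpyHostToDevice);

cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;

cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeClamp;   // clamp handles the border
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode     = cudaFilterModePoint;    // no interpolation
texDesc.readMode       = cudaReadModeElementType;

cudaTextureObject_t texImage = 0;
cudaCreateTextureObject(&texImage, &resDesc, &texDesc, NULL);

// Inside the kernel each window element is then fetched as:
//   unsigned short v = tex2D<unsigned short>(texImage, x + c, y + r);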

Answer 3 (score: 0)

Two tips:

  1. Change the order of your double-loop iteration: iterate over y first (the outer loop) and then over x (the inner loop); see the sketch after this list. This is the most important fix, because it applies to any double loop you will ever write. You want consecutive reads to be as close to each other as possible (for different reasons on different devices: for single-threaded CPU code the cache matters most, for GPUs coalesced memory access matters most, and possibly the cache as well). Because you currently iterate over rows first, you get no coalescing at all and effectively issue one 2-byte (pixel-width) request per pixel. Make sure to read this thread on the subject, even though it only explains the CPU side.
  2. Make sure your reads are coalesced. In your example, even after fixing the loop order, you only read block_width * pixel_width, i.e. 16 x 2 = 32 contiguous bytes at a time. That needs more occupancy for latency hiding than reading 128 bytes at a time. You can improve things by using wider blocks (wider blocks are generally better for this reason). Also make sure your reads are aligned. This is related to the previous point and is explained in the corresponding section of the CUDA C Programming Guide.
  3. Edit: I moved the rest of the answer here.
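Following tip 1, here is a minimal sketch of the reordered window loop from the question's kernel (only the loop roles are swapped; variable names follow the original code, and the snippet is meant as a drop-in replacement for the window-filling loop in MedianFilter_gpu):

// Sketch: let the inner loop walk along x so consecutive iterations touch
// consecutive addresses of the row-major image.
iterator = 0;
for (int r = y - StartPoint; r < y + EndPoint; r++) {       // rows (y) in the outer loop
    for (int c = x - StartPoint; c < x + EndPoint; c++) {   // columns (x) in the inner loop
        surround[tid][iterator] = *(Device_ImageData + (r * Image_Width) + c);
        iterator++;
    }
}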