我有一个包含两个无符号整数的结构数组,想使用双调排序(Bitonic Sort)根据第一个 uint 对它进行排序。我参考 DirectX 示例实现了这段代码,并把它转换成了 GLSL。功能完全正常,但性能很差:CPU 排序(使用 std::sort)反而快了十倍。我漏掉了什么吗?
注意:这可以100%起作用,唯一的问题就是性能。我想这与同步线程和内存访问有关。
根据结构(Elem.c)中的第一个元素进行排序
双调排序着色器(GLSL):
layout(local_size_x = 512) in;
// Element pair: sorted by the first uint `c`; `p` is carried along as payload.
struct Elem {
uint c;
uint p;
};
layout(std430, binding = 4) buffer IndexList {
Elem es[];
};
// u_Level: size of the bitonic subsequence currently being merged (power of two).
// u_LevelMask: bit mask tested against the global index to select sort direction.
uniform uint u_Level;
uniform uint u_LevelMask;
// One 512-invocation workgroup sorts one 512-element tile entirely in shared memory.
shared Elem shared_data[512];
void main() {
// Load this invocation's element into the shared-memory tile.
shared_data[gl_LocalInvocationIndex] = es[gl_GlobalInvocationID.x];
barrier();
// Bitonic merge within the tile: compare/exchange partners at distance j,
// halving j each pass. The ternary keeps either this invocation's own
// element or its partner's, depending on the key comparison of the pair
// (lower index & ~j vs upper index | j) and the direction bit from
// u_LevelMask. barrier() separates the read phase from the write phase.
for (uint j = u_Level >> 1; j > 0; j >>= 1) {
Elem result = ((shared_data[gl_LocalInvocationIndex & ~j].c <= shared_data[gl_LocalInvocationIndex | j].c)
== bool(u_LevelMask & gl_GlobalInvocationID.x)) ?
shared_data[gl_LocalInvocationIndex ^ j] : shared_data[gl_LocalInvocationIndex];
barrier();
shared_data[gl_LocalInvocationIndex] = result;
barrier();
}
// Write the sorted tile back to the SSBO.
es[gl_GlobalInvocationID.x] = shared_data[gl_LocalInvocationIndex];
}
转置着色器(GLSL):
#define XSIZE 16
#define YSIZE 16
layout(local_size_x = XSIZE ,local_size_y = YSIZE, local_size_z = 1) in;
// Same element layout as the sort shader: key `c`, payload `p`.
struct Elem {
uint c;
uint p;
};
layout(std430, binding = 4) buffer InputBUF {
Elem inputElem[];
};
layout(std430, binding = 5) buffer OutputBUF {
Elem outputElem[];
};
// Dimensions of the input viewed as a u_Height x u_Width matrix.
uniform uint u_Width;
uniform uint u_Height;
// 16x16 staging tile so both the global read and the global write stay contiguous.
shared Elem shared_data[XSIZE * YSIZE];
void main() {
// Read one tile element into shared memory (row-major read of the input).
shared_data[gl_LocalInvocationIndex] = inputElem[gl_GlobalInvocationID.y * u_Width + gl_GlobalInvocationID.x];
barrier();
// XY = swapped tile origin plus this invocation's (un-swapped) local coords,
// i.e. the destination coordinates of this element in the transposed matrix.
uvec2 XY = gl_GlobalInvocationID.yx - gl_LocalInvocationID.yx + gl_LocalInvocationID.xy;
// Write the transposed tile: shared memory is read with swapped local indices.
outputElem[XY.y * u_Height + XY.x] = shared_data[gl_LocalInvocationID.x * XSIZE + gl_LocalInvocationID.y];
}
#define BITONIC_BLOCK_SIZE 512
#define TRANSPOSE_BLOCK_SIZE 16
// The number of elements to sort is limited to an even power of 2
// At minimum 8,192 elements - BITONIC_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE
// At maximum 262,144 elements - BITONIC_BLOCK_SIZE * BITONIC_BLOCK_SIZE
const uint MATRIX_WIDTH = BITONIC_BLOCK_SIZE;
const uint MATRIX_HEIGHT = N / BITONIC_BLOCK_SIZE;
// FIX: every dispatch below consumes SSBO data written by the previous
// dispatch, and OpenGL gives NO implicit ordering/visibility guarantee
// between compute dispatches. A glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)
// is required after each dispatch whose SSBO writes the next one reads;
// without it the result is undefined (and drivers that serialize anyway may
// do so with heavy pipeline stalls).
Bitonic->BindSSBO(*IndexList, "IndexList", 4);
// Phase 1: sort each BITONIC_BLOCK_SIZE-element row in shared memory.
for (uint level = 2; level <= BITONIC_BLOCK_SIZE; level <<= 1) {
Bitonic->SetUniform1ui("u_Level", level);
Bitonic->SetUniform1ui("u_LevelMask", level);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}
// Phase 2: merge across rows by transposing so columns become rows.
for (uint level = BITONIC_BLOCK_SIZE << 1; level <= N; level <<= 1) {
// Transpose the data from buffer 1 into buffer 2
Transposer->BindSSBO(*IndexList, "InputBUF", 4);
Transposer->BindSSBO(*SecondaryIndexList, "OutputBUF", 5);
Transposer->SetUniform1ui("u_Width", MATRIX_WIDTH);
Transposer->SetUniform1ui("u_Height", MATRIX_HEIGHT);
Transposer->DispatchCompute(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Sort the transposed column data
Bitonic->BindSSBO(*SecondaryIndexList, "IndexList", 4);
Bitonic->SetUniform1ui("u_Level", level / BITONIC_BLOCK_SIZE);
Bitonic->SetUniform1ui("u_LevelMask", (level & ~N)/BITONIC_BLOCK_SIZE);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Transpose the data from buffer 2 back into buffer 1
Transposer->BindSSBO(*SecondaryIndexList, "InputBUF", 4);
Transposer->BindSSBO(*IndexList, "OutputBUF", 5);
Transposer->SetUniform1ui("u_Width", MATRIX_HEIGHT);
Transposer->SetUniform1ui("u_Height", MATRIX_WIDTH);
Transposer->DispatchCompute(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Sort the row data
Bitonic->BindSSBO(*IndexList, "IndexList", 4);
Bitonic->SetUniform1ui("u_Level", BITONIC_BLOCK_SIZE);
Bitonic->SetUniform1ui("u_LevelMask", level);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}
答案 0 :(得分:-1)
作为一条长评论:下面是按照 Wikipedia 上双调排序算法示意图实现的 CUDA 版本:
// Problem size: total number of float keys to sort; must be a power of two.
const int n = 67108864; // 64M elements
const int l2n= 26; // log2(n)
// shared memory per block, also number of work per block (2048=minimum, 4096=moderate, 8192=maximum).
const int sharedSize= 8192;
const int l22k= 13; // log2(sharedSize)
// Conditionally exchanges two values so they end up in the order requested
// by `dir`: dir == true leaves var1 <= var2 (ascending), dir == false
// leaves var1 >= var2 (descending). Equal values are never swapped.
__device__ void compareSwap(float & var1, float &var2, bool dir)
{
    const bool outOfOrder = dir ? (var1 > var2) : (var1 < var2);
    if (outOfOrder)
    {
        const float held = var1;
        var1 = var2;
        var2 = held;
    }
}
// One compare/exchange pass of a bitonic "box" over global memory.
// Each thread owns one pair (lo, lo + leapSize); the direction alternates
// between the two halves of every box so each box becomes bitonic.
__global__ void computeBox(float * __restrict__ data, const int boxSize, const int leapSize)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
// First half of a box sorts ascending, second half descending.
const bool ascending = (tid % boxSize) < (boxSize / 2);
// Map the thread id onto the lower element of its pair, skipping the
// partner lanes: every leapSize threads advance by an extra leapSize.
const int lo = tid + (tid / leapSize) * leapSize;
compareSwap(data[lo], data[lo + leapSize], ascending);
}
// Same pairing scheme as computeBox, but the direction is forced to
// ascending — used for the final merge that produces the sorted output.
__global__ void computeBoxForward(float * __restrict__ data, const int boxSize, const int leapSize)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const bool ascending = true;
// Lower element of this thread's pair (see computeBox).
const int lo = tid + (tid / leapSize) * leapSize;
compareSwap(data[lo], data[lo + leapSize], ascending);
}
// One compare/exchange pass over a shared-memory tile; `work` selects which
// 1024-thread slice of the tile this call handles.
// FIX: this was declared __global__, but it is invoked as a plain function
// (no <<<...>>> launch configuration) from inside bitonicSharedSort, which
// does not compile — "a __global__ function call must be configured".
// It is a per-thread helper operating on the caller's shared memory, so it
// must be __device__.
__device__ void computeBoxShared(float * __restrict__ data, const int boxSize, const int leapSize, const int work)
{
const int index = threadIdx.x+work*1024;
const bool dir = ((index%boxSize)<(boxSize/2));
const int indexOffset = (index / leapSize)*leapSize;
compareSwap(data[index+indexOffset],data[index+indexOffset+leapSize],dir);
}
// Ascending-only variant of computeBoxShared, used for the final merge of a
// shared-memory tile.
// FIX: declared __global__ but called as a plain function (no <<<...>>>)
// from bitonicSharedSort — that is a compile error; it must be __device__.
__device__ void computeBoxForwardShared(float * __restrict__ data, const int boxSize, const int leapSize, const int work)
{
const int index = threadIdx.x + work*1024;
const bool dir = true;
const int indexOffset = (index / leapSize)*leapSize;
compareSwap(data[index+indexOffset],data[index+indexOffset+leapSize],dir);
}
// Sorts each sharedSize-element tile of `data` independently, entirely in
// shared memory. Launch: (n/sharedSize) blocks of 1024 threads; each thread
// copies nCopy elements in/out and handles nWork compare pairs per pass.
__global__ void bitonicSharedSort(float * __restrict__ data)
{
const int offset = blockIdx.x * sharedSize;
__shared__ float sm[sharedSize];
const int nCopy = sharedSize / 1024; // elements moved per thread
const int nWork = sharedSize / 2048; // compare pairs per thread per pass
// Stage this block's tile into shared memory.
for(int i=0;i<nCopy;i++)
{
sm[threadIdx.x+i*1024] = data[threadIdx.x+offset+i*1024];
}
__syncthreads();
// Build progressively larger bitonic sequences: boxSize doubles every
// outer iteration while the inner loop halves the compare distance.
// NOTE(review): computeBoxShared / computeBoxForwardShared are declared
// __global__ above; calling them here without a launch configuration will
// not compile — they need to be __device__.
int boxSize = 2;
for(int i=0;i<l22k-1;i++)
{
for(int leapSize = boxSize/2;leapSize>0;leapSize /= 2)
{
for(int work=0;work<nWork;work++)
{
computeBoxShared(sm,boxSize,leapSize,work);
}
__syncthreads();
}
boxSize*=2;
}
// Final all-ascending merge of the full tile.
for(int leapSize = boxSize/2;leapSize>0;leapSize /= 2)
{
for(int work=0;work<nWork;work++)
{
computeBoxForwardShared(sm,boxSize,leapSize,work);
}
__syncthreads();
}
// Write the sorted tile back to global memory.
for(int i=0;i<nCopy;i++)
{
data[threadIdx.x+offset+i*1024] = sm[threadIdx.x+i*1024];
}
}
// Continues a bitonic merge inside shared memory once the compare distance
// (leapSizeP) fits within one sharedSize tile. boxSizeP is the current box
// size of the global merge; the direction is derived from the element's
// GLOBAL position (index2), while addressing uses the tile-local index.
__global__ void bitonicSharedMergeLeaps(float * __restrict__ data, const int boxSizeP, const int leapSizeP)
{
const int offset = blockIdx.x * sharedSize;
__shared__ float sm[sharedSize];
const int nCopy = sharedSize / 1024; // elements moved per thread
const int nWork = sharedSize / 2048; // compare pairs per thread per pass
// Stage this block's tile into shared memory.
for(int i=0;i<nCopy;i++)
{
sm[threadIdx.x+i*1024] = data[threadIdx.x+offset+i*1024];
}
__syncthreads();
// Halve the compare distance each pass down to 1, synchronizing between passes.
for(int leapSize = leapSizeP;leapSize>0;leapSize /= 2)
{
for(int work=0;work<nWork;work++)
{
const int index = threadIdx.x+work*1024;
// Global logical index of this pair — decides ascending vs descending.
const int index2 = threadIdx.x+work*1024+blockIdx.x*blockDim.x*nWork;
const bool dir = ((index2%boxSizeP)<(boxSizeP/2));
const int indexOffset = (index / leapSize)*leapSize;
compareSwap(sm[index+indexOffset],sm[index+indexOffset+leapSize],dir);
}
__syncthreads();
}
// Write the merged tile back to global memory.
for(int i=0;i<nCopy;i++)
{
data[threadIdx.x+offset+i*1024] = sm[threadIdx.x+i*1024];
}
}
// launch this with 1 cuda thread
// dynamic parallelism = needs something newer than cc v3.0
// Top-level driver: sorts `data[0..n)` by launching child kernels.
// 1) sort each sharedSize tile in shared memory, 2) grow boxSize, doing the
// wide compare distances in global memory and finishing each box's small
// distances in shared memory, 3) final all-ascending merge.
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in recent
// CUDA toolkits (removed for cc 9.0+) — confirm against the target toolkit.
// Child-launch errors are not checked here.
extern "C"
__global__ void bitonicSort(float * __restrict__ data)
{
bitonicSharedSort<<<(n/sharedSize),1024>>>(data);
int boxSize = sharedSize;
for(int i=l22k-1;i<l2n-1;i++)
{
if(boxSize>sharedSize)
{
// Wide compare distances cross tile boundaries: do them in global memory
// until leapSize fits in a tile, then finish the box in shared memory.
int leapSize= boxSize/2;
for(;leapSize>sharedSize/2;leapSize /= 2)
{
computeBox<<<(n/1024)/2,1024>>>(data,boxSize,leapSize);
}
bitonicSharedMergeLeaps<<<(n/sharedSize),1024>>>(data,boxSize, leapSize);
}
else
{
// Whole box fits in one tile: merge entirely in shared memory.
bitonicSharedMergeLeaps<<<(n/sharedSize),1024>>>(data,boxSize, sharedSize/2);
}
boxSize*=2;
}
// Final forward (ascending) merge across the full array.
for(int leapSize = boxSize/2;leapSize>0;leapSize /= 2)
{
computeBoxForward<<<(n/1024)/2,1024>>>(data,boxSize,leapSize);
}
cudaDeviceSynchronize();
}
具有基准结果:
Array elements GT1030 std::sort GTX1080ti
(benchmark) (1 core ) (guesstimate)
(no overclock)
1024 not applicable -
2048 not applicable -
4096 not applicable -
8192 363 µs 114 µs -
16k 463 µs 248 µs -
32k 746 µs 536 µs -
64k 1.23 ms 1.15 ms -
128k 2.32 ms 2.46 ms -
256k 4.87 ms 5.4 ms ~1.5+ 0.3 ms
512k 8.72 ms 11.7 ms ~3 + 0.5 ms
1M 18.3 ms 22 ms ~6 + 1.2 ms
2M 39 ms 48 ms ~12 + 2.7 ms
4M 86 ms 101 ms ~23 + 6.3 ms
8M 187 ms 211 ms ~47 + 14 ms
16M 407 ms 451 ms ~95 + 32 ms
32M 883 ms 940 ms ~190+ 70 ms
64M 1.93 s 2.0 s ~380+ 150 ms
(float keys) (copy+kernel ) (copy + kernel)
(using same pcie)
pcie v2.0 4x: 1.4GB/s
fx8150 @ 3.6GHz
4GB RAM 1333MHz
(single channel DDR3)
性能提升从 128k 个元素开始出现(具体阈值因系统而异);即使该实现没有经过深入优化,PCI-e 传输瓶颈和内核启动开销也会在更大的数组上再次限制加速比。
如果不计入 PCI-e 数据拷贝,即使在这块低端 GPU 上,加速也从 64k 个元素就开始了。