我有一个包含两个无符号整数的结构数组,想使用双调排序(Bitonic Sort)根据第一个 uint 对它进行排序。我参考 DirectX 示例实现了这段代码,并把它转换成了 GLSL。功能完全正常,但性能很差:CPU 排序(使用 std::sort)反而快了十倍。我漏掉了什么吗?
注意:这可以100%起作用,唯一的问题就是性能。我想这与同步线程和内存访问有关。
根据结构(Elem.c)中的第一个元素进行排序
双调排序着色器(GLSL):
layout(local_size_x = 512) in;
// Element pair: sorted by the first uint `c`; `p` is carried along as payload.
struct Elem {
uint c;
uint p;
};
layout(std430, binding = 4) buffer IndexList {
Elem es[];
};
// u_Level: size of the bitonic subsequence currently being merged (power of two).
// u_LevelMask: bit mask tested against the global index to select sort direction.
uniform uint u_Level;
uniform uint u_LevelMask;
// One 512-invocation workgroup sorts one 512-element tile entirely in shared memory.
shared Elem shared_data[512];
void main() {
// Load this invocation's element into the shared-memory tile.
shared_data[gl_LocalInvocationIndex] = es[gl_GlobalInvocationID.x];
barrier();
// Bitonic merge within the tile: compare/exchange partners at distance j,
// halving j each pass. The ternary keeps either this invocation's own
// element or its partner's, depending on the key comparison of the pair
// (lower index & ~j vs upper index | j) and the direction bit from
// u_LevelMask. barrier() separates the read phase from the write phase.
for (uint j = u_Level >> 1; j > 0; j >>= 1) {
Elem result = ((shared_data[gl_LocalInvocationIndex & ~j].c <= shared_data[gl_LocalInvocationIndex | j].c)
== bool(u_LevelMask & gl_GlobalInvocationID.x)) ?
shared_data[gl_LocalInvocationIndex ^ j] : shared_data[gl_LocalInvocationIndex];
barrier();
shared_data[gl_LocalInvocationIndex] = result;
barrier();
}
// Write the sorted tile back to the SSBO.
es[gl_GlobalInvocationID.x] = shared_data[gl_LocalInvocationIndex];
}
转置着色器(GLSL):
#define XSIZE 16
#define YSIZE 16
layout(local_size_x = XSIZE ,local_size_y = YSIZE, local_size_z = 1) in;
// Same element layout as the sort shader: key `c`, payload `p`.
struct Elem {
uint c;
uint p;
};
layout(std430, binding = 4) buffer InputBUF {
Elem inputElem[];
};
layout(std430, binding = 5) buffer OutputBUF {
Elem outputElem[];
};
// Dimensions of the input viewed as a u_Height x u_Width matrix.
uniform uint u_Width;
uniform uint u_Height;
// 16x16 staging tile so both the global read and the global write stay contiguous.
shared Elem shared_data[XSIZE * YSIZE];
void main() {
// Read one tile element into shared memory (row-major read of the input).
shared_data[gl_LocalInvocationIndex] = inputElem[gl_GlobalInvocationID.y * u_Width + gl_GlobalInvocationID.x];
barrier();
// XY = swapped tile origin plus this invocation's (un-swapped) local coords,
// i.e. the destination coordinates of this element in the transposed matrix.
uvec2 XY = gl_GlobalInvocationID.yx - gl_LocalInvocationID.yx + gl_LocalInvocationID.xy;
// Write the transposed tile: shared memory is read with swapped local indices.
outputElem[XY.y * u_Height + XY.x] = shared_data[gl_LocalInvocationID.x * XSIZE + gl_LocalInvocationID.y];
}
#define BITONIC_BLOCK_SIZE 512
#define TRANSPOSE_BLOCK_SIZE 16
// The number of elements to sort is limited to an even power of 2
// At minimum 8,192 elements - BITONIC_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE
// At maximum 262,144 elements - BITONIC_BLOCK_SIZE * BITONIC_BLOCK_SIZE
const uint MATRIX_WIDTH = BITONIC_BLOCK_SIZE;
const uint MATRIX_HEIGHT = N / BITONIC_BLOCK_SIZE;
// FIX: every dispatch below consumes SSBO data written by the previous
// dispatch, and OpenGL gives NO implicit ordering/visibility guarantee
// between compute dispatches. A glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)
// is required after each dispatch whose SSBO writes the next one reads;
// without it the result is undefined (and drivers that serialize anyway may
// do so with heavy pipeline stalls).
Bitonic->BindSSBO(*IndexList, "IndexList", 4);
// Phase 1: sort each BITONIC_BLOCK_SIZE-element row in shared memory.
for (uint level = 2; level <= BITONIC_BLOCK_SIZE; level <<= 1) {
Bitonic->SetUniform1ui("u_Level", level);
Bitonic->SetUniform1ui("u_LevelMask", level);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}
// Phase 2: merge across rows by transposing so columns become rows.
for (uint level = BITONIC_BLOCK_SIZE << 1; level <= N; level <<= 1) {
// Transpose the data from buffer 1 into buffer 2
Transposer->BindSSBO(*IndexList, "InputBUF", 4);
Transposer->BindSSBO(*SecondaryIndexList, "OutputBUF", 5);
Transposer->SetUniform1ui("u_Width", MATRIX_WIDTH);
Transposer->SetUniform1ui("u_Height", MATRIX_HEIGHT);
Transposer->DispatchCompute(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Sort the transposed column data
Bitonic->BindSSBO(*SecondaryIndexList, "IndexList", 4);
Bitonic->SetUniform1ui("u_Level", level / BITONIC_BLOCK_SIZE);
Bitonic->SetUniform1ui("u_LevelMask", (level & ~N)/BITONIC_BLOCK_SIZE);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Transpose the data from buffer 2 back into buffer 1
Transposer->BindSSBO(*SecondaryIndexList, "InputBUF", 4);
Transposer->BindSSBO(*IndexList, "OutputBUF", 5);
Transposer->SetUniform1ui("u_Width", MATRIX_HEIGHT);
Transposer->SetUniform1ui("u_Height", MATRIX_WIDTH);
Transposer->DispatchCompute(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Sort the row data
Bitonic->BindSSBO(*IndexList, "IndexList", 4);
Bitonic->SetUniform1ui("u_Level", BITONIC_BLOCK_SIZE);
Bitonic->SetUniform1ui("u_LevelMask", level);
Bitonic->DispatchCompute(N / BITONIC_BLOCK_SIZE, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}
答案 0 :(得分:-1)
作为一条长评论:下面是按照 Wikipedia 上双调排序算法示意图实现的 CUDA 版本:
// Problem size: total number of float keys to sort; must be a power of two.
const int n = 67108864; // 64M elements
const int l2n= 26; // log2(n)
// shared memory per block, also number of work per block (2048=minimum, 4096=moderate, 8192=maximum).
const int sharedSize= 8192;
const int l22k= 13; // log2(sharedSize)
// Conditionally exchanges two values so they end up in the order requested
// by `dir`: dir == true leaves var1 <= var2 (ascending), dir == false
// leaves var1 >= var2 (descending). Equal values are never swapped.
__device__ void compareSwap(float & var1, float &var2, bool dir)
{
    const bool outOfOrder = dir ? (var1 > var2) : (var1 < var2);
    if (outOfOrder)
    {
        const float held = var1;
        var1 = var2;
        var2 = held;
    }
}
// One compare/exchange pass of a bitonic "box" over global memory.
// Each thread owns one pair (lo, lo + leapSize); the direction alternates
// between the two halves of every box so each box becomes bitonic.
__global__ void computeBox(float * __restrict__ data, const int boxSize, const int leapSize)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
// First half of a box sorts ascending, second half descending.
const bool ascending = (tid % boxSize) < (boxSize / 2);
// Map the thread id onto the lower element of its pair, skipping the
// partner lanes: every leapSize threads advance by an extra leapSize.
const int lo = tid + (tid / leapSize) * leapSize;
compareSwap(data[lo], data[lo + leapSize], ascending);
}
// Same pairing scheme as computeBox, but the direction is forced to
// ascending — used for the final merge that produces the sorted output.
__global__ void computeBoxForward(float * __restrict__ data, const int boxSize, const int leapSize)
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const bool ascending = true;
// Lower element of this thread's pair (see computeBox).
const int lo = tid + (tid / leapSize) * leapSize;
compareSwap(data[lo], data[lo + leapSize], ascending);
}
// One compare/exchange pass over a shared-memory tile; `work` selects which
// 1024-thread slice of the tile this call handles.
// FIX: this was declared __global__, but it is invoked as a plain function
// (no <<<...>>> launch configuration) from inside bitonicSharedSort, which
// does not compile — "a __global__ function call must be configured".
// It is a per-thread helper operating on the caller's shared memory, so it
// must be __device__.
__device__ void computeBoxShared(float * __restrict__ data, const int boxSize, const int leapSize, const int work)
{
const int index = threadIdx.x+work*1024;
const bool dir = ((index%boxSize)<(boxSize/2));
const int indexOffset = (index / leapSize)*leapSize;
compareSwap(data[index+indexOffset],data[index+indexOffset+leapSize],dir);
}
// Ascending-only variant of computeBoxShared, used for the final merge of a
// shared-memory tile.
// FIX: declared __global__ but called as a plain function (no <<<...>>>)
// from bitonicSharedSort — that is a compile error; it must be __device__.
__device__ void computeBoxForwardShared(float * __restrict__ data, const int boxSize, const int leapSize, const int work)
{
const int index = threadIdx.x + work*1024;
const bool dir = true;
const int indexOffset = (index / leapSize)*leapSize;
compareSwap(data[index+indexOffset],data[index+indexOffset+leapSize],dir);
}
// Sorts each sharedSize-element tile of `data` independently, entirely in
// shared memory. Launch: (n/sharedSize) blocks of 1024 threads; each thread
// copies nCopy elements in/out and handles nWork compare pairs per pass.
__global__ void bitonicSharedSort(float * __restrict__ data)
{
const int offset = blockIdx.x * sharedSize;
__shared__ float sm[sharedSize];
const int nCopy = sharedSize / 1024; // elements moved per thread
const int nWork = sharedSize / 2048; // compare pairs per thread per pass
// Stage this block's tile into shared memory.
for(int i=0;i<nCopy;i++)
{
sm[threadIdx.x+i*1024] = data[threadIdx.x+offset+i*1024];
}
__syncthreads();
// Build progressively larger bitonic sequences: boxSize doubles every
// outer iteration while the inner loop halves the compare distance.
// NOTE(review): computeBoxShared / computeBoxForwardShared are declared
// __global__ above; calling them here without a launch configuration will
// not compile — they need to be __device__.
int boxSize = 2;
for(int i=0;i<l22k-1;i++)
{
for(int leapSize = boxSize/2;leapSize>0;leapSize /= 2)
{
for(int work=0;work<nWork;work++)
{
computeBoxShared(sm,boxSize,leapSize,work);
}
__syncthreads();
}
boxSize*=2;
}
// Final all-ascending merge of the full tile.
for(int leapSize = boxSize/2;leapSize>0;leapSize /= 2)
{
for(int work=0;work<nWork;work++)
{
computeBoxForwardShared(sm,boxSize,leapSize,work);
}
__syncthreads();
}
// Write the sorted tile back to global memory.
for(int i=0;i<nCopy;i++)
{
data[threadIdx.x+offset+i*1024] = sm[threadIdx.x+i*1024];
}
}
// Continues a bitonic merge inside shared memory once the compare distance
// (leapSizeP) fits within one sharedSize tile. boxSizeP is the current box
// size of the global merge; the direction is derived from the element's
// GLOBAL position (index2), while addressing uses the tile-local index.
__global__ void bitonicSharedMergeLeaps(float * __restrict__ data, const int boxSizeP, const int leapSizeP)
{
const int offset = blockIdx.x * sharedSize;
__shared__ float sm[sharedSize];
const int nCopy = sharedSize / 1024; // elements moved per thread
const int nWork = sharedSize / 2048; // compare pairs per thread per pass
// Stage this block's tile into shared memory.
for(int i=0;i<nCopy;i++)
{
sm[threadIdx.x+i*1024] = data[threadIdx.x+offset+i*1024];
}
__syncthreads();
// Halve the compare distance each pass down to 1, synchronizing between passes.
for(int leapSize = leapSizeP;leapSize>0;leapSize /= 2)
{
for(int work=0;work<nWork;work++)
{
const int index = threadIdx.x+work*1024;
// Global logical index of this pair — decides ascending vs descending.
const int index2 = threadIdx.x+work*1024+blockIdx.x*blockDim.x*nWork;
const bool dir = ((index2%boxSizeP)<(boxSizeP/2));
const int indexOffset = (index / leapSize)*leapSize;
compareSwap(sm[index+indexOffset],sm[index+indexOffset+leapSize],dir);
}
__syncthreads();
}
// Write the merged tile back to global memory.
for(int i=0;i<nCopy;i++)
{
data[threadIdx.x+offset+i*1024] = sm[threadIdx.x+i*1024];
}
}
// launch this with 1 cuda thread
// dynamic parallelism = needs something newer than cc v3.0
// Top-level driver: sorts `data[0..n)` by launching child kernels.
// 1) sort each sharedSize tile in shared memory, 2) grow boxSize, doing the
// wide compare distances in global memory and finishing each box's small
// distances in shared memory, 3) final all-ascending merge.
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in recent
// CUDA toolkits (removed for cc 9.0+) — confirm against the target toolkit.
// Child-launch errors are not checked here.
extern "C"
__global__ void bitonicSort(float * __restrict__ data)
{
bitonicSharedSort<<<(n/sharedSize),1024>>>(data);
int boxSize = sharedSize;
for(int i=l22k-1;i<l2n-1;i++)
{
if(boxSize>sharedSize)
{
// Wide compare distances cross tile boundaries: do them in global memory
// until leapSize fits in a tile, then finish the box in shared memory.
int leapSize= boxSize/2;
for(;leapSize>sharedSize/2;leapSize /= 2)
{
computeBox<<<(n/1024)/2,1024>>>(data,boxSize,leapSize);
}
bitonicSharedMergeLeaps<<<(n/sharedSize),1024>>>(data,boxSize, leapSize);
}
else
{
// Whole box fits in one tile: merge entirely in shared memory.
bitonicSharedMergeLeaps<<<(n/sharedSize),1024>>>(data,boxSize, sharedSize/2);
}
boxSize*=2;
}
// Final forward (ascending) merge across the full array.
for(int leapSize = boxSize/2;leapSize>0;leapSize /= 2)
{
computeBoxForward<<<(n/1024)/2,1024>>>(data,boxSize,leapSize);
}
cudaDeviceSynchronize();
}
具有基准结果:
Array elements GT1030 std::sort GTX1080ti
(benchmark) (1 core ) (guesstimate)
(no overclock)
1024 not applicable -
2048 not applicable -
4096 not applicable -
8192 363 µs 114 µs -
16k 463 µs 248 µs -
32k 746 µs 536 µs -
64k 1.23 ms 1.15 ms -
128k 2.32 ms 2.46 ms -
256k 4.87 ms 5.4 ms ~1.5+ 0.3 ms
512k 8.72 ms 11.7 ms ~3 + 0.5 ms
1M 18.3 ms 22 ms ~6 + 1.2 ms
2M 39 ms 48 ms ~12 + 2.7 ms
4M 86 ms 101 ms ~23 + 6.3 ms
8M 187 ms 211 ms ~47 + 14 ms
16M 407 ms 451 ms ~95 + 32 ms
32M 883 ms 940 ms ~190+ 70 ms
64M 1.93 s 2.0 s ~380+ 150 ms
(float keys) (copy+kernel ) (copy + kernel)
(using same pcie)
pcie v2.0 4x: 1.4GB/s
fx8150 @ 3.6GHz
4GB RAM 1333MHz
(single channel DDR3)
性能提升从 128k 个元素开始出现(具体阈值因系统而异);即使该实现没有经过深入优化,PCI-e 传输瓶颈和内核启动开销也会在更大的数组上再次限制加速比。
如果不计入 PCI-e 数据拷贝,即使在这块低端 GPU 上,加速也从 64k 个元素就开始了。