Cuda共享内存错误

时间:2015-03-24 13:04:40

标签: cuda parallel-processing

我有一个简单的基数排序(每次只按一个比特位排序,并且只使用一个线程块)。我的第一个版本可以正常工作;随后我尝试先在共享内存中对键进行排序,以便对DRAM进行合并(coalesced)的内存写入。但是这个版本得到了错误的结果——数据没有被排序。

首先是简单且可以正常工作的版本:

// One split step of a single-block radix sort: partitions the keys by the bit
// at position `desp`, keeping the relative order inside each partition.
//
// Launch layout: indexing uses threadIdx.x only, so the kernel handles one key
// per thread of a single 1D block. The static scan buffer holds 1024 entries,
// so blockDim.x must not exceed 1024.
//
//   in_keys  : keys to split, one per thread (read at index threadIdx.x).
//   histo    : histo[0] is used as the base output index of the keys whose
//              selected bit is 1, i.e. it is expected to hold the number of
//              keys whose selected bit is 0 (computed by the caller).
//   desp     : bit position used for the split.
//   out_keys : receives the keys, 0-bit keys first, then 1-bit keys.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
    const int tid = threadIdx.x;
    const int n = blockDim.x;

    __shared__ unsigned int s_sum[1024];

    // Read the key once; it is needed for the flag and for the final store.
    const unsigned int key = in_keys[tid];
    // first == 1 when the selected bit of this thread's key is 0.
    const unsigned int first = ((key >> desp) & 1) == 0;

    s_sum[tid] = first;
    __syncthreads();

    // Inclusive prefix sum of the flags. The loop bound is the same for every
    // thread in the block, so every thread reaches both barriers in every
    // iteration; only the shared-memory accesses are guarded by the
    // per-thread condition.
    for (int off = 1; off < n; off *= 2) {
        const int pos = tid - off;
        unsigned int partial = 0;
        if (pos >= 0) {
            partial = s_sum[pos];
        }
        // All reads of this step finish before any thread overwrites its slot.
        __syncthreads();
        if (pos >= 0) {
            s_sum[tid] += partial;
        }
        // All writes of this step are visible before the next step reads.
        __syncthreads();
    }

    // Exclusive count of 0-bit keys before this thread.
    int offset = s_sum[tid] - first;
    if (first == 0) {
        // 1-bit key: (tid - offset) is the number of 1-bit keys before this
        // thread; they are placed after the histo[0] leading slots.
        offset = histo[0] + tid - offset;
    }

    out_keys[offset] = key;
}

第二个版本(先在共享内存中排序,结果不正确):

// Second attempt at the one-bit split step: same offset computation as the
// first version, but the keys are first scattered into shared memory and then
// copied to out_keys with one store per thread at index tid.
// All indexing uses threadIdx.x only, and the shared buffer has 1024 entries.
// NOTE(review): this is the version reported as producing unsorted output;
// the comments below mark the barrier problems in the scan loop.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;

//Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when the bit at position desp of this thread's key is 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// NOTE(review): the loop condition pos>=0 depends on tid, so threads run a
// different number of iterations (thread 0 runs none). The __syncthreads()
// inside the body is therefore not reached by all threads of the block the
// same number of times, which CUDA does not allow.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
    int a=s_sum[pos];
    int b=s_sum[tid];
    __syncthreads();
    // NOTE(review): there is no barrier between this write and the next
    // iteration's read of s_sum[pos] by other threads.
    s_sum[tid]=a+b;
}
__syncthreads();

// Inclusive scan value minus this thread's own flag.
int offset=s_sum[tid]-first;
if (first==0){
    //Get offset for '1' bit keys
    offset=histo[0]+tid-offset;
}

// s_sum is reused as the staging buffer for the keys: the barrier before the
// scatter lets every thread finish reading its scan value above, and the
// barrier after it makes the scattered keys visible before the copy out.
__syncthreads();
s_sum[offset]=in_keys[tid];
__syncthreads();
out_keys[tid]=s_sum[tid];

}

1 个答案:

答案 0 :(得分:0)

问题在于我在条件代码中调用了__syncthreads()。只有当块中的所有线程都走相同的执行路径时(即该条件对整个线程块的求值结果一致),才允许在条件代码中调用__syncthreads()。正确的版本:

// One split step of a single-block radix sort: partitions the keys by the bit
// at position `desp`, keeping the relative order inside each partition. The
// keys are first reordered in shared memory so that the final store to
// out_keys is one write per thread at index threadIdx.x.
//
// Launch layout: indexing uses threadIdx.x only, so the kernel handles one key
// per thread of a single 1D block. The static shared buffer holds 1024
// entries, so blockDim.x must not exceed 1024, and every computed offset must
// fall inside [0, blockDim.x) for the staged copy to be complete.
//
//   in_keys  : keys to split, one per thread (read at index threadIdx.x).
//   histo    : histo[0] is used as the base index of the keys whose selected
//              bit is 1, i.e. it is expected to hold the number of keys whose
//              selected bit is 0 (computed by the caller).
//   desp     : bit position used for the split.
//   out_keys : receives the keys, 0-bit keys first, then 1-bit keys.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
    __shared__ unsigned int s_sum[1024];
    const int tid = threadIdx.x;
    const int n = blockDim.x;

    // Keep the key in a register: s_sum is reused for the keys further down.
    const unsigned int v = in_keys[tid];
    // first == 1 when the selected bit of this thread's key is 0.
    const unsigned int first = ((v >> desp) & 1) == 0;

    s_sum[tid] = first;
    __syncthreads();

    // Inclusive prefix sum of the flags. The bound blockDim.x is uniform
    // across the block, so every thread reaches both barriers in every
    // iteration; steps with off >= blockDim.x would not touch any element,
    // so they are not executed.
    for (int off = 1; off < n; off *= 2) {
        const int pos = tid - off;
        unsigned int partial = 0;
        if (pos >= 0) {
            partial = s_sum[pos];
        }
        // All reads of this step finish before any thread overwrites its slot.
        __syncthreads();
        if (pos >= 0) {
            s_sum[tid] += partial;
        }
        // All writes of this step are visible before the next step reads.
        __syncthreads();
    }

    // Exclusive count of 0-bit keys before this thread.
    int offset = s_sum[tid] - first;
    if (first == 0) {
        // 1-bit key: (tid - offset) is the number of 1-bit keys before this
        // thread; they are placed after the histo[0] leading slots.
        offset = histo[0] + tid - offset;
    }

    // Every thread must have read its scan value before s_sum is overwritten
    // with keys.
    __syncthreads();
    s_sum[offset] = v;
    // The scattered keys must be visible before each thread copies its slot.
    __syncthreads();
    out_keys[tid] = s_sum[tid];
}