共享内存中有一个长度为N的数组。数组稀疏(有很多零元素)。
__shared__ int arr[N]; // N ints in shared memory, visible to every thread of the block
目标是让该线程块中的所有线程(例如128个线程)都找到满足 a <= j < N 且 arr[j] 非零的第一个(最小的)索引 j。显而易见的方法是:
// Sequential reference scan: walk [a, N) in order and record in j the first
// index whose element is non-zero. j is left untouched when the whole range is zero.
for(int i = a; i < N; i++){
    if(arr[i]){ // non-zero element found (the goal is the first NON-zero entry)
        j = i;
        break;
    }
}
但是,此方法不是并行的(线程之间没有协作),并且存在严重的共享内存存储体冲突(bank conflict)。我想知道是否有更高效的方法。
答案 0 :(得分:1)
这是一个并行归约(parallel reduction)问题。本质上,您是在做适当的谓词测试之后求最小值(即满足条件的最小索引)。下面我写一些代码来演示。为简单起见,我做了一些简化假设,例如只使用简单的共享内存归约(而不是 warp-shuffle):
const int nTPB = 128;
// nTPB (threads per block) is assumed to be a power of 2
...
__shared__ int arr[N];
__shared__ int red[nTPB];
// code which populates arr ...
// ...
// every thread's slot starts at the sentinel N, i.e. "no non-zero element seen yet"
red[threadIdx.x] = N;
__syncthreads();
// predicate test: each thread visits a+threadIdx.x, a+threadIdx.x+nTPB, ...
// and keeps the smallest index whose element is non-zero
for (int idx = a + threadIdx.x; idx < N; idx += nTPB){
    if (arr[idx] && idx < red[threadIdx.x]) red[threadIdx.x] = idx;
}
__syncthreads();
// min-reduction over red[]: halve the number of active slots each pass
for (int half = nTPB / 2; half >= 1; half /= 2){
    if (threadIdx.x < half){
        const int other = red[threadIdx.x + half];
        if (other < red[threadIdx.x]) red[threadIdx.x] = other;
    }
    __syncthreads();
}
// red[0] now holds the result
// a result of N means every value in the range [a, N) was 0
这是一个快速的测试案例:
$ cat t1640.cu
#include <iostream>
const int nTPB = 128; // threads per block; also the length of the kernel's shared reduction buffer
const int N = 1045; // number of elements in the array being searched
const int a = 23; // first index included in the search range [a, N)
// assume nTPB, number of threads per block, is a power of 2
// Finds the smallest index j in [a, N) with d[j] != 0 and writes it to *r.
// Writes N when every element in that range is zero.
//
// d: device pointer to at least N ints (input).
// r: device pointer to one int (output).
//
// Launch assumptions: the strides and the halving sweep are hard-wired to nTPB,
// so the block must have exactly nTPB threads and nTPB must be a power of 2.
// blockIdx is not used; the test driver launches a single block.
// Static shared memory: (N + nTPB) ints.
__global__ void k(int *d, int *r){
    __shared__ int arr[N];
    __shared__ int red[nTPB];
    // stage the input in shared memory (block-stride loop)
    for (int idx = threadIdx.x; idx < N; idx += nTPB)
        arr[idx] = d[idx];
    // the scan below reads arr elements that were written by other threads
    __syncthreads();
    // Per-thread candidate lives in a register; N is the "nothing found" sentinel.
    int best = N;
    for (int idx = threadIdx.x + a; idx < N; idx += nTPB){
        if (arr[idx]){
            // idx only grows for a given thread, so its first hit is its minimum
            best = idx;
            break;
        }
    }
    red[threadIdx.x] = best;
    __syncthreads();
    // standard min-finding sweep reduction in shared memory
    for (int half = nTPB >> 1; half > 0; half >>= 1){
        if (threadIdx.x < half)
            red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x + half]);
        __syncthreads();
    }
    // red[0] holds the block-wide minimum; N means all values in [a, N) were 0
    if (threadIdx.x == 0) *r = red[0];
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool cudaOk(cudaError_t err, const char *what){
    if (err == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Test driver: builds an all-zero array with a single non-zero entry at index 44,
// runs kernel k on one block of nTPB threads, and prints the index it reports.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){
    int *h_d, *d_d = 0, *h_r, *d_r = 0;
    h_d = new int[N];
    h_r = new int[1];
    for (int i = 0; i < N; i++) h_d[i] = 0;
    h_d[44] = 1;

    bool ok = cudaOk(cudaMalloc(&d_d, N*sizeof(d_d[0])), "cudaMalloc(d_d)")
           && cudaOk(cudaMalloc(&d_r, sizeof(d_r[0])), "cudaMalloc(d_r)")
           && cudaOk(cudaMemcpy(d_d, h_d, N*sizeof(d_d[0]), cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");
    if (ok){
        k<<<1, nTPB>>>(d_d, d_r);
        // launch-configuration errors only show up through cudaGetLastError();
        // the blocking cudaMemcpy that follows surfaces in-kernel faults
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaMemcpy(h_r, d_r, sizeof(d_r[0]), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }
    if (ok) std::cout << h_r[0] << std::endl;

    // release everything on every path; cudaFree on a null pointer is a no-op
    cudaFree(d_d);
    cudaFree(d_r);
    delete[] h_d;
    delete[] h_r;
    return ok ? 0 : 1;
}
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors
$
答案 1 :(得分:0)
我不明白这里怎么会发生存储体冲突(bank conflict),因为您是在单个线程中完成所有工作的。我的解决方案是以线程块大小为步长进行遍历,并在找到索引时设置一个标志。
__shared__ int arr[N];
__shared__ int current_found[128]; // one slot per thread; assumes blockDim.x <= 128
__shared__ int found_index;        // final answer, readable by every thread after the last barrier
int block_size = blockDim.x;
int thread_id = threadIdx.x;

// Shared memory is not zero-initialized and cannot take an initializer, so each
// thread marks its own slot with the sentinel N ("nothing found"). N is used
// rather than 0 because 0 is itself a valid index.
current_found[thread_id] = N;
// Orders the sentinel writes before thread 0's scan even when the loop below
// runs zero times (a >= N).
__syncthreads();

// Stride by block_size starting at a. The loop bound depends only on `base`,
// which is identical in all threads, so every thread reaches the barrier the
// same number of times.
for (int base = a; base < N; base += block_size){
    const int i = base + thread_id;
    const int hit = (i < N) && arr[i];
    if (hit) current_found[thread_id] = i;
    // Barrier plus block-wide OR of `hit`: every thread gets the same flag value,
    // so either all threads leave the loop here or none do.
    const int index_found = __syncthreads_or(hit);
    if (index_found) break;
}

if (thread_id == 0){ // in master thread, pick the answer among the per-thread candidates
    // Hits can only come from the single iteration that ended the loop, and within
    // one iteration a lower thread id means a lower index, so the first slot that
    // is not N holds the smallest index.
    found_index = N;
    for (int t = 0; t < block_size; t++){
        if (current_found[t] != N){
            found_index = current_found[t];
            break;
        }
    }
}
__syncthreads(); // publish found_index to the whole block
// found_index is now the first j in [a, N) with arr[j] != 0, or N if there is none
代码未经测试,是直接在网页编辑器中编写的。