Question

共享内存中有一个长度为N的数组。数组稀疏（有很多零元素）。

__shared__ int arr[N];

目标是让该线程块中的所有线程（例如 128 个线程）都找到第一个满足 N > j >= a 且 arr[j] 非零的索引 j。显而易见的方法是：

// Sequential scan of [a, N): record the first index whose element is non-zero.
// If every element in the range is zero, j is left unchanged.
for(int i = a; i < N; i++){
  if(arr[i]){  // non-zero element found (the test must not be negated)
    j = i;
    break;
  }
}

但是，此方法不是并行的（线程之间没有协作），并且存在严重的存储体冲突（bank conflict）。我想知道是否有更高效的方法。

Answer 1

这是一个并行归约（parallel reduction）问题。本质上，您是在通过谓词测试（元素非零）的索引中寻找最小值。我将编写一些代码进行演示。为了简单起见，我会做一些简化的假设，例如采用简单的共享内存归约（而不是 warp-shuffle 归约）：

const int nTPB = 128;
// nTPB (threads per block) is assumed to be a power of 2
...
__shared__ int arr[N];
__shared__ int red[nTPB];
// code which populates arr ...
// ...
const int tid = threadIdx.x;
// smallest qualifying index seen by this thread so far; N means "none"
int best = N;
// barrier ahead of the scan so shared-memory writes made above are visible to every thread
__syncthreads();
// block-stride scan of [a, N): thread tid visits a+tid, a+tid+nTPB, ...
for (int idx = tid + a; idx < N; idx += nTPB){
  if (arr[idx] && idx < best) best = idx;
}
red[tid] = best;
__syncthreads();
// tree-style minimum over red[]: the active half folds the upper half into itself
for (int half = nTPB/2; half > 0; half /= 2){
  if (tid < half){
    const int other = red[tid + half];
    if (other < red[tid]) red[tid] = other;
  }
  __syncthreads();
}
// red[0] now holds the answer;
// it equals N when every element in [a, N) was 0

这是一个快速的测试案例：

$ cat t1640.cu
#include <iostream>

const int nTPB = 128;  // threads per block used for the launch; also the length of the per-thread scratch array in k
const int N = 1045;    // number of ints in the array being searched
const int a = 23;      // first index of the searched range [a, N)
// Finds the smallest index j in [a, N) with d[j] != 0 and stores it in *r;
// stores N when every element in that range is zero.
// Launch assumptions: one-dimensional block with blockDim.x == nTPB, and
// nTPB (threads per block) a power of 2. d must hold at least N ints.
// Uses (N + nTPB) ints of static shared memory.
__global__ void k(int *d, int *r){
  __shared__ int arr[N];
  __shared__ int red[nTPB];
  const int tid = threadIdx.x;
  // stage the input in shared memory, one block-stride pass
  for (int idx = tid; idx < N; idx += nTPB){
    arr[idx] = d[idx];
  }
  // smallest qualifying index seen by this thread so far; N means "none"
  int best = N;
  // every element of arr must be written before any thread scans it
  __syncthreads();
  // block-stride scan of [a, N): thread tid visits a+tid, a+tid+nTPB, ...
  for (int idx = tid + a; idx < N; idx += nTPB){
    if (arr[idx] && idx < best) best = idx;
  }
  red[tid] = best;
  __syncthreads();
  // tree-style minimum over red[]: the active half folds the upper half into itself
  for (int half = nTPB/2; half > 0; half /= 2){
    if (tid < half){
      const int other = red[tid + half];
      if (other < red[tid]) red[tid] = other;
    }
    __syncthreads();
  }
  // red[0] now holds the answer; N means all values in [a, N) were 0
  if (tid == 0) *r = red[0];
}

// Prints a diagnostic to stderr and returns false when a CUDA runtime call did not succeed.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
  return false;
}

// Host driver: builds an all-zero array of N ints with a single 1 at index 44,
// runs kernel k on one block of nTPB threads and prints the index it reports
// (44 for this input). Returns 0 on success and 1 if any CUDA call fails.
int main(){

  int *h_d, *h_r;
  int *d_d = 0, *d_r = 0;  // stay null until allocated so the cleanup below is always safe
  h_d = new int[N];
  h_r = new int[1];
  for (int i = 0; i < N; i++) h_d[i] = 0;
  h_d[44] = 1;
  // && short-circuits, so nothing runs after the first failing call
  bool ok = cudaOk(cudaMalloc(&d_d, N*sizeof(d_d[0])), "cudaMalloc(d_d)")
         && cudaOk(cudaMalloc(&d_r, sizeof(d_r[0])), "cudaMalloc(d_r)")
         && cudaOk(cudaMemcpy(d_d, h_d, N*sizeof(d_d[0]), cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");
  if (ok){
    k<<<1, nTPB>>>(d_d, d_r);
    // a launch returns no status itself: cudaGetLastError reports launch failures,
    // and the blocking copy that follows reports failures raised while the kernel ran
    ok = cudaOk(cudaGetLastError(), "kernel launch")
      && cudaOk(cudaMemcpy(h_r, d_r, sizeof(d_r[0]), cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
  }
  if (ok) std::cout << h_r[0] << std::endl;
  // release everything on both the success and the failure path; cudaFree(0) does nothing
  cudaFree(d_d);
  cudaFree(d_r);
  delete[] h_d;
  delete[] h_r;
  return ok ? 0 : 1;
}
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors
$

Answer 2

我不明白这里怎么会发生存储体冲突（bank conflict），因为您是在一个线程中完成所有工作的。我的解决方案是以线程块大小为步长遍历数组，并在找到索引时设置标志。

// Strided search for the first index i in [a, N) with arr[i] != 0.
// Assumes a one-dimensional block of at most 128 threads and that arr has
// already been filled by the code preceding this fragment.
__shared__ int arr[N];
__shared__ int current_found[128];  // one candidate index per thread; N means "nothing found"
int block_size = blockDim.x;
int thread_id = threadIdx.x;
// shared memory starts uninitialized (and __shared__ variables cannot have initializers),
// so every thread clears its own slot explicitly
current_found[thread_id] = N;
__syncthreads();  // slots (and earlier writes to arr) are visible to the whole block from here on
// base is the same in every thread, so all threads run the same number of
// iterations and reach the barrier inside the loop together
for(int base = a; base < N; base += block_size){ //stride by block_size
    const int i = base + thread_id;
    const int hit = (i < N) && arr[i];  // i < N is tested first, so arr is never read out of range
    if(hit){
        current_found[thread_id] = i;  // candidate only: several threads may hit in the same pass
    }
    // barrier plus block-wide OR: every thread receives the same flag, so the
    // early exit is taken by all threads or by none
    const int index_found = __syncthreads_or(hit);
    if(index_found) break;
}
if(thread_id ==0){  // in master thread, find the index among possible values
    //mandatory sequential part
    // all candidates come from the same pass, where the index grows with the
    // thread id, so the first occupied slot holds the smallest index
    for(int t = 0; t < block_size ; t++){
        if(current_found[t] != N){  // compare against N: index 0 is a valid result
            // current_found[t] is your index. Get your index here the way you want.
            break;
        }
    }
}

代码未经测试，因为是直接在网页编辑器中编写的。

在CUDA中查找第一个非零元素

2 个答案: