Question

假设我有一个8个数字的数组A，我有另一个数字B，以确定A中的数字应该向右移动多少个位置

3,6,7,8,1,2,3,5

B 0,1,0,0,0,0,0,0

0表示有效，1表示此数字应为1位后，输出数组应在3之后插入0，输出数组C应为：

C：3,0,6,7,8,1,2,3

是否插入0或其他内容并不重要，重点是3之后的所有数字都移动了一个位置。出站号码将不再出现在阵列中。

另一个例子：

3,6,7,8,1,2,3,5

B 0,1,0,0,2,0,0,0

C 3,0,6,7,8,0,1,2

.......................................

3,6,7,8,1,2,3,5

B 0,1,0,0,1,0,0,0

C 3,0,6,7,8,1,2,3

我正在考虑使用scan / prefix-sum或类似的东西来解决这个问题。此数组也很小，我应该能够将数组放入一个warp（＆lt; 32数字）中并使用shuffle指令。有人有想法吗？

Answer 1

一种可能的方法。

由于您的转变含糊不清（例如0, 1, 0, 1，0, 1, 1, 1和0, 1, 0 ,0都产生相同的数据偏移模式），因此无法创建移位模式的前缀和，以产生每个位置的相对偏移。然而，我们可以观察到，如果换档模式中的每个零都被左边的第一个非零移位值替换，则会创建一个有效的偏移模式：

0, 1, 0, 0   (shift pattern)
0, 1, 1, 1   (offset pattern)

或

0, 2, 0, 2   (shift pattern)
0, 2, 2, 2   (offset pattern)

那怎么办呢？让我们假设我们有第二个测试用例转换模式：

      0, 1, 0, 0, 2, 0, 0, 0

我们想要的偏移模式是：

      0, 1, 1, 1, 2, 2, 2, 2

对于给定的移位模式，创建二进制值，其中如果移位模式中相应索引处的值为零，则每个位为1，否则为零。我们可以使用warp vote指令，称为__ballot()。每条车道将从选票中获得相同的价值：
```
  1  0  1  1  0  1  1  1  (this is a single binary 8-bit value in this case)
```
现在每个经线车道都会取这个值，并为它添加一个值，该值在经线车道位置有1位。在示例的其余部分使用第1道：
```
+ 0  0  0  0  0  0  1  0  (the only 1 bit in this value will be at the lane index)
= 1  0  1  1  1  0  0  1
```
我们现在采用步骤2的结果，并按步骤1的结果进行按位异或运算：
```
= 0  0  0  0  1  1  1  0
```
我们现在计算此值中的1位数（此处有__popc() intrinsic），并从结果中减去1。因此，对于上面的第1道示例，此步骤的结果将为2，因为设置了3位。这使得使用到我们左边的第一个值的距离在原始换档模式中是非零的。因此，对于车道1示例，车道1左侧的第一个非零值是2车道更高，即车道3。
对于每个泳道，我们使用步骤4的结果来获取该泳道的适当偏移值。我们可以使用__shfl_down() warp shuffle指令一次处理所有通道。
```
  0, 1, 1, 1, 2, 2, 2, 2
```
从而产生我们想要的＆＃34;偏移模式＆＃34;。

一旦我们得到了所需的偏移模式，让每个warp lane使用其偏移值来适当地移动其数据项的过程非常简单。

这是一个完整的示例，使用您的3个测试用例。上面的步骤1-4包含在__device__函数mydelta中。内核的其余部分正在执行步骤5 shuffle，适当地索引到数据中，并复制数据。由于使用了warp shuffle指令，我们必须为cc3.0或更高版本的GPU编译它。（但是，将warp shuffle指令替换为允许在cc2.0或更高版本的设备上运行的其他索引代码并不困难。）此外，由于使用了各种内在函数，此函数不能用于超过32个数据项，但这是你问题中陈述的先决条件。

$ cat t475.cu
#include <stdio.h>
#define DSIZE 8

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


__device__ int mydelta(const int shift){
  unsigned nz = __ballot(shift == 0);
  unsigned mylane = (threadIdx.x & 31);
  unsigned lanebit = 1<<mylane;
  unsigned temp = nz + lanebit;
  temp = nz ^ temp;
  unsigned delta = __popc(temp);
  return delta-1;
}
__global__ void mykernel(const int *data, const unsigned *shift, int *result, const int limit){ // limit <= 32
  if (threadIdx.x < limit){
    unsigned lshift = shift[(limit - 1) - threadIdx.x];
    unsigned delta = mydelta(lshift);
    unsigned myshift = __shfl_down(lshift, delta);
    myshift = __shfl(myshift, ((limit -1) - threadIdx.x)); // reverse offset pattern
    result[threadIdx.x] = 0;
    if ((myshift + threadIdx.x) < limit)
    result[threadIdx.x + myshift] = data[threadIdx.x];
  }
}

int main(){
  int A[DSIZE]         = {3, 6, 7, 8, 1, 2, 3, 5};
  unsigned tc1B[DSIZE] = {0, 1, 0, 0, 0, 0, 0, 0};
  unsigned tc2B[DSIZE] = {0, 1, 0, 0, 2, 0, 0, 0};
  unsigned tc3B[DSIZE] = {0, 1, 0, 0, 1, 0, 0, 0};

  int *d_data, *d_result, *h_result;
  unsigned *d_shift;
  h_result = (int *)malloc(DSIZE*sizeof(int));
  if (h_result == NULL) { printf("malloc fail\n"); return 1;}
  cudaMalloc(&d_data, DSIZE*sizeof(int));
  cudaMalloc(&d_shift, DSIZE*sizeof(unsigned));
  cudaMalloc(&d_result, DSIZE*sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemcpy(d_data, A, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_shift, tc1B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMempcyH2D fail");
  mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMempcyD2H fail");
  printf("index: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", i);
  printf("\nA:     ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", A[i]);
  printf("\ntc1 B: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", tc1B[i]);
  printf("\ntc1 C: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", h_result[i]);
  cudaMemcpy(d_shift, tc2B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMempcyH2D fail");
  mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMempcyD2H fail");
  printf("\ntc2 B: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", tc2B[i]);
  printf("\ntc2 C: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", h_result[i]);
  cudaMemcpy(d_shift, tc3B, DSIZE*sizeof(unsigned), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMempcyH2D fail");
  mykernel<<<1,32>>>(d_data, d_shift, d_result, DSIZE);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  cudaMemcpy(h_result, d_result, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMempcyD2H fail");
  printf("\ntc3 B: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", tc3B[i]);
  printf("\ntc2 C: ");
  for (int i = 0; i < DSIZE; i++)
    printf("%d, ", h_result[i]);
  printf("\n");
  return 0;
}
$ nvcc -arch=sm_35 -o t475 t475.cu
$ ./t475
index: 0, 1, 2, 3, 4, 5, 6, 7,
A:     3, 6, 7, 8, 1, 2, 3, 5,
tc1 B: 0, 1, 0, 0, 0, 0, 0, 0,
tc1 C: 3, 0, 6, 7, 8, 1, 2, 3,
tc2 B: 0, 1, 0, 0, 2, 0, 0, 0,
tc2 C: 3, 0, 6, 7, 8, 0, 1, 2,
tc3 B: 0, 1, 0, 0, 1, 0, 0, 0,
tc2 C: 3, 0, 6, 7, 8, 1, 2, 3,
$

执行小插入/移位的并行算法

1 个答案: