Question

我一直在尝试使用CUDA实现基数选择（radix select），它利用第k小的元素来对给定数量的元素进行排序。基数选择背后的主要思想是从MSB到LSB逐位扫描32位整数：它把当前位为0的元素划分到左侧，把当前位为1的元素划分到右侧，然后对包含第k小元素的一侧递归求解。我的分区过程工作正常，但我在处理递归函数调用时遇到了问题：我无法让递归终止。请帮助我！我的内核函数如下所示，这是kernel.h：

  #include "header.h"
  #define WARP_SIZE 32
  #define BLOCK_SIZE 32

// Splits the values handled by one warp according to a single bit and returns
// the index just past the last "zero" slot (pivot + 1).
//
// Parameters:
//   d_DataIn  - device array that is read and then overwritten in place.
//   firstidx  - lowest lane index that takes part in the partition.
//   lastidx   - highest lane index that takes part in the partition.
//   k         - not referenced anywhere in this function.
//   N         - element count; only used to derive the loop bound N / WARP_SIZE.
//   bit       - bit selector; the tested bit is (31 - bit), so bit == 0 is the MSB.
//
// NOTE(review): lanes outside [firstidx, lastidx] never execute a `return`
// statement, so the value they hand back to the caller is undefined.
__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
// Global thread index; assumes the block was launched with BLOCK_SIZE threads -- TODO confirm.
int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
int WarpID = threadID >> 5;
// Lane index inside the warp (threadID modulo 32).
int LocWarpID = threadID - 32 * WarpID;
int NumWarps = N / WARP_SIZE;
int pivot;

// Staging buffer for the rearranged values of this block.
__shared__ int DataPartition[BLOCK_SIZE];
// Declared but never read or written in this function.
__shared__ int DataBinary[WARP_SIZE];

for(int i = 0; i < NumWarps; i++)
{
    // The range test compares the lane index, not a global element index.
    if(LocWarpID >= firstidx && LocWarpID <=lastidx)
    {
        int r = d_DataIn[i * WARP_SIZE + LocWarpID];
        // p is the examined bit of r (0 or 1).
        int p = (r>>(31-bit))&1;
        // One bit per lane whose p is non-zero. The vote is issued inside a
        // conditional branch, so only lanes that passed the range test take part.
        // NOTE(review): lanes that do not vote contribute 0 to B and therefore 1
        // to B_flip -- confirm this is intended when firstidx > 0.
        unsigned int B = __ballot(p);
        unsigned int B_flip = ~B;
        if(p==1)
        {
            // Keep only the vote bits of the lanes below this one.
            // NOTE(review): for LocWarpID == 0 the shift count is 32, which is
            // undefined for a 32-bit operand.
            int b = B << (32-LocWarpID);
            // Number of lower lanes that also hold a 1.
            int RightLoc = __popc(b);
            // Ones are stored downwards starting at lastidx.
            DataPartition[lastidx - RightLoc] = r;
        }
        else
        {
            // Same construction for the zero lanes (same shift-by-32 caveat).
            int b_flip = B_flip << (32 - LocWarpID);
            int LeftLoc = __popc(b_flip);
            // Zeros are stored upwards starting at index 0.
            DataPartition[LeftLoc] = r;
        }

    // NOTE(review): there is no barrier between the shared-memory writes above
    // and the reads below, and both branches perform the identical copy.
    if(LocWarpID <= lastidx - __popc(B))
        {

            d_DataIn[LocWarpID] = DataPartition[LocWarpID];
        }
    else
        {

            d_DataIn[LocWarpID] = DataPartition[LocWarpID];
        }
        // lastidx minus the number of ones counted by the vote.
        pivot = lastidx - __popc(B); 
        // Returns during the first loop iteration in which the range test holds,
        // so for these lanes i never advances past its current value.
        return pivot+1;
    }

}   
}


// Recursive radix select: partitions [firstidx, lastidx] on the current bit and
// descends into the side that contains the k-th element.
//
// Parameters:
//   d_DataIn  - device array being partitioned in place by Partition().
//   firstidx  - first index of the active range.
//   lastidx   - last index of the active range.
//   k         - rank searched for, relative to firstidx.
//   N         - element count, forwarded unchanged to Partition().
//   bit       - current bit selector; Partition() tests bit (31 - bit).
//
// Returns the first element of d_DataIn once the search stops.
// NOTE(review): every exit returns *d_DataIn (element 0) rather than the element
// at the selected position -- confirm that this is what the caller expects.
__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
    // Termination: stop when the range has collapsed (or become inverted), and
    // also when all 32 bits have been consumed. Partition() shifts by (31 - bit),
    // which is only defined for bit in [0, 31]; without this bound the recursion
    // keeps descending for as long as the range still holds more than one element.
    if (firstidx >= lastidx || bit < 0 || bit > 31)
        return *d_DataIn;

    const int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit);
    const int length = q - firstidx;

    if (k == length)
        return *d_DataIn;

    if (k < length)
        return RadixSelect(d_DataIn, firstidx, q - 1, k, N, bit + 1);

    return RadixSelect(d_DataIn, q, lastidx, k - length, N, bit + 1);
}

 // Kernel entry point: forwards every argument unchanged to RadixSelect().
 __global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
    // The value returned by RadixSelect() is intentionally dropped; the only
    // observable effect of the kernel is what the call does to d_DataIn.
    (void)RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit);
}

主机代码是main.cu，它看起来像：

#include "header.h"
#include <iostream>
#include <fstream>
#include "kernel.h"

#define BLOCK_SIZE 32
using namespace std;

// Host driver: generates N random floats, reinterprets them as ints on the
// device, runs the radix kernel on them and writes the resulting array to a file.
// Returns 0 on success and 1 when the kernel launch or the copy back fails.
int main()
{
    int N = 32;

    // Random floats in [1, 100) and their integer bit patterns on the host.
    thrust::host_vector<float> h_HostFloat(N);
    thrust::counting_iterator<unsigned int> Numbers(0);
    thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(), RandomFloatNumbers(1.f, 100.f));
    thrust::host_vector<int> h_HostInt(N);
    thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt());

    // Same conversion performed on the device; this is the kernel input.
    thrust::device_vector<float> d_DeviceFloat = h_HostFloat;
    thrust::device_vector<int> d_DeviceInt(N);
    thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt());
    int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data());

    // Byte count for the device-to-host copy.
    int size = N * sizeof(int);

    // Host staging buffers hold N elements each. (Element counts, not byte
    // counts: allocating `new int[size]` reserved sizeof(int) times too much.)
    int *h_DataOut = new int[N];
    float *h_DataOut1 = new float[N];

    int firstidx = 0;
    int lastidx = BLOCK_SIZE - 1;
    int k = 20;
    int bit = 1;
    int NUM_BLOCKS = N / BLOCK_SIZE;

    radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit);

    // Launch-configuration errors are only visible through cudaGetLastError();
    // errors raised while the kernel runs surface at the blocking copy below.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost);

    int status = 0;
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        status = 1;
    }
    else
    {
        WriteData(h_DataOut1, h_DataOut, 10, N);
    }

    // Release the host buffers on every path.
    delete[] h_DataOut1;
    delete[] h_DataOut;

    return status;
}

我使用的头文件列表：

   #include "cuda.h"
   #include "cuda_runtime_api.h"
   #include "device_launch_parameters.h"
   #include <thrust/host_vector.h>
   #include <thrust/device_vector.h>
   #include <thrust/transform.h>
   #include <thrust/generate.h>
   #include "functor.h"
   #include <thrust/iterator/counting_iterator.h>
   #include <thrust/copy.h>
   #include <thrust/device_ptr.h>

另一个头文件“functor.h”，用于将浮点数转换为int类型并生成随机浮点数。

  #include <thrust/random.h>
  #include <sstream>
  #include <fstream>
  #include <iomanip>
  // Functor that maps an index n to a float drawn from a uniform real
  // distribution constructed with (a, b). A fresh default-constructed engine is
  // advanced by n steps on every call, so the same n always yields the same value.
  struct RandomFloatNumbers
  {
      // Arguments handed to the uniform distribution.
      float a, b;

      __host__ __device__
      RandomFloatNumbers(float lo, float hi) : a(lo), b(hi) {}

      __host__ __device__
      float operator() (const unsigned int n) const
      {
          thrust::default_random_engine engine;
          engine.discard(n);  // skip ahead so each index gets its own draw

          thrust::uniform_real_distribution<float> uniform(a, b);
          return uniform(engine);
      }
  };

   // Functor that returns the bit pattern of a float as an int by writing one
   // member of a union and reading the other (no numeric conversion).
   struct FloatToInt
   {
       __host__ __device__
       int operator() (const float &x) const
       {
           union
           {
               float as_float;
               int as_int;
           } bits;

           bits.as_float = x;
           return bits.as_int;
       }
   };

   // Inverse of FloatToInt: returns the float whose bit pattern equals x, using
   // a union write/read (no numeric conversion).
   float IntToFloat(int &x)
   {
       union
       {
           int as_int;
           float as_float;
       } bits;

       bits.as_int = x;
       return bits.as_float;
   }

   // Converts N int bit patterns to floats and writes them, one per line, to
   // "out\Partition_<bit>.txt" (bit is zero-padded to two digits).
   //
   // Parameters:
   //   h_DataOut1 - output buffer of at least N floats; receives the converted values.
   //   h_DataOut  - input buffer of at least N ints holding float bit patterns.
   //   bit        - number used in the file name and in the console message.
   //   N          - number of elements to convert and write.
   //
   // Returns true only when the file was opened and every write, including the
   // final flush on close, succeeded; returns false otherwise.
   bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
   {
       std::stringstream file;
       file << "out\\Partition_";
       file << std::setfill('0') << std::setw(2) << bit;
       file << ".txt";

       std::ofstream data;
       data.open((file.str()).c_str());
       if (!data.is_open())
       {
           std::cout << "File is not open" << std::endl;
           return false;
       }

       for (int i = 0; i < N; i++)
       {
           h_DataOut1[i] = IntToFloat(h_DataOut[i]);
           data << h_DataOut1[i] << std::endl;
       }
       data << std::endl;
       data.close();

       // A failed write or a failed close leaves failbit/badbit set; do not
       // report success in that case.
       if (data.fail())
           return false;

       std::cout << "Partition=" << bit << "\n";
       return true;
   }

Answer 1

应您的要求，我贴出我用来调查此问题、并帮助我研究您的代码的代码。

#include <stdio.h>
#include <stdlib.h>

// One partition step of the flag-based radix select.
//
// For the element at position idx, records in ones[idx] / zeroes[idx] whether it
// belongs to the current partition AND has bit (31 - bit) set / clear, then
// returns the number of "one" elements in the whole block.
//
// Parameters:
//   data       - element values, indexed by idx.
//   partition  - flag array; non-zero at idx means the element is still a candidate.
//                May be the same array as ones or zeroes: each thread only
//                touches its own idx, and the flag is read before it is rewritten.
//   ones       - output flags for candidates whose tested bit is 1.
//   zeroes     - output flags for candidates whose tested bit is 0.
//   bit        - bit selector; bit == 0 tests the most significant bit.
//   idx        - position handled by the calling thread.
//   warp_ones  - scratch array of 32 counters, one per warp.
//
// Preconditions: every thread of the block must call this function (it contains
// block-wide barriers and a full-mask warp vote), and the block must consist of
// 32 full warps, because the reduction below sums all 32 entries of warp_ones
// while only the entries of existing warps are written here.
// Requires the *_sync warp intrinsics (CUDA 9 or newer).
__device__ int gpu_partition(unsigned int *data, unsigned int *partition, unsigned int *ones, unsigned int* zeroes, int bit, int idx, unsigned int* warp_ones){
  // Classify this thread's element.
  const bool valid = (partition[idx] != 0);
  const bool one = valid && ((data[idx] & (1ULL<<(31-bit))) != 0);

  // Barrier before warp_ones is overwritten; presumably this keeps a second call
  // from clobbering the counter that other threads are still returning from the
  // previous call -- do not move any write to warp_ones above it.
  __syncthreads();

  const unsigned int my_one = one ? 1u : 0u;
  const unsigned int my_zero = (valid && !one) ? 1u : 0u;
  ones[idx] = my_one;
  zeroes[idx] = my_zero;

  // Per-warp count of ones. The explicit full mask replaces the legacy
  // mask-less __ballot, which is not available when compiling for Volta and later.
  const unsigned int warp_one = __popc(__ballot_sync(0xffffffffu, my_one));
  if (!(threadIdx.x & 31))
    warp_ones[threadIdx.x>>5] = warp_one;
  __syncthreads();

  // Tree reduction of the 32 per-warp counters into warp_ones[0].
  for (int i = 16; i > 0; i>>=1){
    if (threadIdx.x < i)
      warp_ones[threadIdx.x] += warp_ones[threadIdx.x + i];
    __syncthreads();}
  return warp_ones[0];
}

// Radix select without data movement: finds the element of rank m (0-based,
// ascending) among the first n elements of data and stores it in *result.
//
// Launch layout: a single block of 1024 threads (gpu_partition sums 32 per-warp
// counters, and the shared arrays hold 1024 entries). Static shared memory:
// three arrays of 1024 words plus 32 words. Elements beyond blockDim.x are not
// examined. When n == 0 nothing is written to *result.
//
// Parameters:
//   data    - device array with at least n valid elements.
//   m       - rank of the wanted element.
//   n       - number of valid elements in data.
//   result  - device location that receives the selected value.
__global__ void gpu_radixkernel(unsigned int *data, unsigned int m, unsigned int n, unsigned int *result){
  __shared__ unsigned int loc_data[1024];
  __shared__ unsigned int loc_ones[1024];
  __shared__ unsigned int loc_zeroes[1024];
  __shared__ unsigned int loc_warp_ones[32];
  int l=0;            // lower bound (rank) of the current candidate set
  int bit = 0;        // next bit to examine, 0 == MSB
  unsigned int u = n; // upper bound (exclusive rank) of the current candidate set
  if (n<2){
    if ((n == 1) && !(threadIdx.x)) *result = data[0];
    return;}

  // Only the first n threads own a real element. The others must not touch
  // global memory: the caller guarantees just n valid entries, so reading
  // data[threadIdx.x] unconditionally could run past the end of the buffer.
  // Their slot is zero-filled and never selected because its flag is 0.
  const bool in_range = (threadIdx.x < n);
  loc_data[threadIdx.x] = in_range ? data[threadIdx.x] : 0u;
  loc_ones[threadIdx.x] = in_range ? 1 : 0;
  __syncthreads();

  // `next` points at the flag array describing the surviving candidates.
  unsigned int *next = loc_ones;
  do {
    // s = number of surviving candidates whose current bit is 1.
    int s = gpu_partition(loc_data, next, loc_ones, loc_zeroes, bit++, threadIdx.x, loc_warp_ones);
    if ((u-s) > m){
      // Rank m lies among the zeros: drop the ones from the top of the range.
      u = (u-s);
      next = loc_zeroes;}
    else{
      // Rank m lies among the ones: drop the zeros from the bottom of the range.
      l = (u-s);
      next = loc_ones;}}
    while ((u != l) && (bit<32));

  // Every thread still flagged holds the selected value.
  if (next[threadIdx.x]) *result = loc_data[threadIdx.x];
}
// Host reference partition of data[l..u] on bit (31 - bit).
// After the call, the elements whose tested bit is 0 occupy the low end of the
// range and those whose bit is 1 occupy the high end; within each group the
// original order is reversed. Returns the index of the last "zero" element,
// which is l - 1 when the range contains no zeros.
int partition(unsigned int *data, int l, int u, int bit){
  const int count = (u-l)+1;
  const unsigned long long mask = 1ULL<<(31-bit);
  unsigned int *scratch = (unsigned int *)malloc(count*sizeof(unsigned int));

  // Fill the scratch buffer from the top down: first the ones, then the zeros.
  int slot = count;
  int num_ones = 0;
  for (int i = l; i<=u; i++){
    if (data[i] & mask){
      scratch[--slot] = data[i];
      num_ones++;}}
  for (int i = l; i<=u; i++){
    if (!(data[i] & mask))
      scratch[--slot] = data[i];}

  // The scratch buffer is already in final order; copy it back over the range.
  for (int t = 0; t < count; t++)
    data[l+t] = scratch[t];

  free(scratch);
  return u-num_ones;
}

// Host reference radix select: returns the element that would sit at index m if
// data[l..u] were sorted ascending (as unsigned values). The range is
// rearranged in place by partition().
//
// Parameters:
//   data - array to search; modified.
//   l, u - inclusive bounds of the range still under consideration.
//   m    - absolute index (l <= m <= u) of the wanted element.
//   bit  - first bit to examine; 0 is the most significant bit.
unsigned int radixselect(unsigned int *data, int l, int u, int m, int bit){
  // Narrow the range one bit at a time. bit must stay within [0, 31] because
  // partition() shifts by (31 - bit); the previous guard (bit > 32) let bit == 32
  // through, which shifted by a negative count.
  while ((l != u) && (bit <= 31)){
    int s = partition(data, l, u, bit);   // index of the last element whose bit is 0
    if (s >= m) u = s;                    // wanted element is among the zeros
    else        l = s+1;                  // wanted element is among the ones
    bit++;
  }
  // Either a single element is left, or all 32 bits have been examined and the
  // remaining elements agree on every bit, i.e. they are duplicates of the
  // wanted value. In both cases data[l] is the answer.
  return data[l];
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns 1 when err is an error, 0 when it is cudaSuccess.
static int cuda_failed(cudaError_t err, const char *what){
  if (err == cudaSuccess) return 0;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
  return 1;
}

// Runs gpu_radixkernel for rank m over the first n elements of d_data and
// copies the selected value into *h_result. Returns 0 on success.
static int gpu_select(unsigned int *d_data, unsigned int m, unsigned int n, unsigned int *d_result, unsigned int *h_result){
  gpu_radixkernel<<<1,1024>>>(d_data, m, n, d_result);
  // Launch errors are only reported through cudaGetLastError().
  if (cuda_failed(cudaGetLastError(), "gpu_radixkernel launch")) return 1;
  // The blocking copy also surfaces errors raised while the kernel was running.
  return cuda_failed(cudaMemcpy(h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost), "cudaMemcpy (result)");
}

// GPU part of the self-test. The device buffers are owned by the caller.
// Returns 0 when every step succeeded and the GPU agreed with the host version.
static int run_gpu_tests(unsigned int *d_data, unsigned int *d_result, const unsigned int *data){
  unsigned int h_result = 0;

  // Small fixed data set: print the element of every rank.
  if (cuda_failed(cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice), "cudaMemcpy (data)")) return 1;
  for (int i = 0; i < 8; i++){
    if (gpu_select(d_data, i, 8, d_result, &h_result)) return 1;
    printf("gpu result index %d = %u\n", i, h_result);
    }

  // 1024 random values: compare every rank against the host implementation.
  unsigned int data2[1024];
  unsigned int data3[1024];

  for (int i = 0; i < 1024; i++) data2[i] = rand();
  if (cuda_failed(cudaMemcpy(d_data, data2, 1024*sizeof(unsigned int), cudaMemcpyHostToDevice), "cudaMemcpy (data2)")) return 1;
  for (int i = 0; i < 1024; i++){
    // radixselect() rearranges its input, so it works on a fresh copy each time.
    for (int j = 0; j<1024; j++) data3[j] = data2[j];
    unsigned int cpuresult = radixselect(data3, 0, 1023, i, 0);
    if (gpu_select(d_data, i, 1024, d_result, &h_result)) return 1;
    if (h_result != cpuresult) {printf("mismatch at index %d, cpu: %u, gpu: %u\n", i, cpuresult, h_result); return 1;}
    }
  return 0;
}

// Self-test driver: exercises the host radix select on a small data set, then
// runs the GPU kernel on the same data and on 1024 random values, checking the
// GPU result against the host result. Returns 0 on success, 1 on any CUDA error
// or mismatch.
int main(){

  unsigned int data[8] = {32767, 22, 88, 44, 99, 101, 0, 7};
  unsigned int data1[8];

  for (int i = 0; i<8; i++){
    for (int j=0; j<8; j++) data1[j] = data[j];
    // The values are unsigned, so they are printed with %u.
    printf("value[%d] = %u\n", i, radixselect(data1, 0, 7, i, 0));}

  unsigned int *d_data = NULL;
  unsigned int *d_result = NULL;
  int status = 1;

  if (!cuda_failed(cudaMalloc((void **)&d_data, 1024*sizeof(unsigned int)), "cudaMalloc (data)") &&
      !cuda_failed(cudaMalloc((void **)&d_result, sizeof(unsigned int)), "cudaMalloc (result)"))
    status = run_gpu_tests(d_data, d_result, data);

  // Release the device buffers on every path; cudaFree accepts a null pointer.
  cudaFree(d_result);
  cudaFree(d_data);

  if (status == 0) printf("Finished\n");
  return status;
}

以下是一些注释，没有特别的顺序：

我去掉了所有thrust代码，就基数选择算法而言，它没有做任何有用的事情。我也觉得你把float转换成int的做法很奇怪。我没有仔细考虑过：对一串指数位后跟一串尾数位的位序列做按位基数选择会产生什么后果。它可能可行（不过我认为如果把符号位也包含进来，它肯定行不通），但同样，我不认为这是理解该算法的核心。
我添加了一个主机版本，我只是为了检查我的设备结果。
我很确定在有重复元素的情况下，此算法会失败。例如，如果你把它全部为零的矢量，我认为它会失败。但我认为处理这种情况并不困难。
我的主机版本是递归的，但我的设备版本不是。我没有看到递归在这里是有用的，因为算法的非递归形式也很容易编写，特别是因为最多有32位通过。尽管如此，如果你想创建一个递归设备版本，通过在分区函数中加入u，s和l操作代码应该不难。
我省略了常规的CUDA错误检查（typical cuda error checking）。不过我建议加上它。
我不认为这是cuda编程的典范。如果您深入研究基数排序算法（例如here），您会发现它非常复杂。快速GPU基数选择看起来与我的代码完全不同。我编写的代码类似于串行递归分区基数排序，这不是在大规模并行架构上执行此操作的最佳方法。
由于基数选择不是排序，我尝试编写不对输入数据做任何数据移动的设备代码，因为我认为数据移动代价高昂且没有必要。我在内核开始时从全局内存读取一次数据，此后所有工作都在共享内存中完成；即使在共享内存中，我也没有重新排列数据（不像我在主机版本中那样），以避免数据移动的成本。相反，我为1分区和0分区各保留一个标志数组，提供给下一个分区步骤。数据移动会带来相当数量的非合并（uncoalesced）和/或存在存储体冲突（bank conflict）的访问，而标志数组使所有访问都没有存储体冲突。

radix select使用cuda

1 个答案: