我一直在努力使用CUDA开发一个基数选择,它利用k个最小元素来对给定数量的元素进行排序。这个基数选择背后的主要思想是扫描从MSB到LSB的32位整数。它将左侧的所有0位和右侧的所有1位分区。包含k个最小元素的一侧递归求解。我的分区过程工作正常但我在处理递归函数调用时遇到问题。我无法阻止递归。请帮助我! 我的内核函数如下所示:这是kernel.h
#include "header.h"
#define WARP_SIZE 32
#define BLOCK_SIZE 32
__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
int WarpID = threadID >> 5;
int LocWarpID = threadID - 32 * WarpID;
int NumWarps = N / WARP_SIZE;
int pivot;
__shared__ int DataPartition[BLOCK_SIZE];
__shared__ int DataBinary[WARP_SIZE];
for(int i = 0; i < NumWarps; i++)
{
if(LocWarpID >= firstidx && LocWarpID <=lastidx)
{
int r = d_DataIn[i * WARP_SIZE + LocWarpID];
int p = (r>>(31-bit))&1;
unsigned int B = __ballot(p);
unsigned int B_flip = ~B;
if(p==1)
{
int b = B << (32-LocWarpID);
int RightLoc = __popc(b);
DataPartition[lastidx - RightLoc] = r;
}
else
{
int b_flip = B_flip << (32 - LocWarpID);
int LeftLoc = __popc(b_flip);
DataPartition[LeftLoc] = r;
}
if(LocWarpID <= lastidx - __popc(B))
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
else
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
pivot = lastidx - __popc(B);
return pivot+1;
}
}
}
__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
if(firstidx == lastidx)
return *d_DataIn;
int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit);
int length = q - firstidx;
if(k == length)
return *d_DataIn;
else if(k < length)
return RadixSelect(d_DataIn, firstidx, q-1, k, N, bit+1);
else
return RadixSelect(d_DataIn, q, lastidx, k-length, N, bit+1);
}
__global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit);
}
主机代码是main.cu,它看起来像:
#include "header.h"
#include <iostream>
#include <fstream>
#include "kernel.h"
#define BLOCK_SIZE 32
using namespace std;
int main()
{
int N = 32;
thrust::host_vector<float>h_HostFloat(N);
thrust::counting_iterator <unsigned int> Numbers(0);
thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(), RandomFloatNumbers(1.f, 100.f));
thrust::host_vector<int>h_HostInt(N);
thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt());
thrust::device_vector<float>d_DeviceFloat = h_HostFloat;
thrust::device_vector<int>d_DeviceInt(N);
thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt());
int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data());
int *h_DataOut;
float *h_DataOut1;
int fsize = N * sizeof(float);
int size = N * sizeof(int);
h_DataOut = new int[size];
h_DataOut1 = new float[fsize];
int firstidx = 0;
int lastidx = BLOCK_SIZE-1;
int k = 20;
int bit = 1;
int NUM_BLOCKS = N / BLOCK_SIZE;
radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit);
cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost);
WriteData(h_DataOut1, h_DataOut, 10, N);
return 0;
}
我使用的标题列表:
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/generate.h>
#include "functor.h"
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
另一个头文件“functor.h”,用于将浮点数转换为int类型并生成随机浮点数。
#include <thrust/random.h>
#include <sstream>
#include <fstream>
#include <iomanip>
struct RandomFloatNumbers
{
float a, b;
__host__ __device__
RandomFloatNumbers(float _a, float _b) : a(_a), b(_b) {};
__host__ __device__
float operator() (const unsigned int n) const{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a,b);
rng.discard(n);
return dist(rng);
}
};
struct FloatToInt
{
__host__ __device__
int operator() (const float &x)
const {
union {
float f_value;
int i_value;
} value;
value.f_value = x;
return value.i_value;
}
};
float IntToFloat(int &x)
{
union{
float f_value;
int i_value;
}value;
value.i_value = x;
return value.f_value;
}
bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
{
std::ofstream data;
std::stringstream file;
file << "out\\Partition_";
file << std::setfill('0') <<std::setw(2) << bit;
file << ".txt";
data.open((file.str()).c_str());
if(data.is_open() == false)
{
std::cout << "File is not open" << std::endl;
return false;
}
for(int i = 0; i < N; i++)
{
h_DataOut1[i] = IntToFloat(h_DataOut[i]);
//cout << h_HostFloat[i] << " \t" << h_DataOut1[i] << endl;
//std::bitset<32>bitshift(h_DataOut[i]&1<<31-bit);
//data << bitshift[31-bit] << "\t" <<h_DataOut1[i] <<std::endl;
data << h_DataOut1[i] << std::endl;
}
data << std::endl;
data.close();
std::cout << "Partition=" <<bit <<"\n";
return true;
}
答案 0 :(得分:1)
根据您的要求,我发布了用于调查此问题的代码,并帮助我研究您的代码。
#include <stdio.h>
#include <stdlib.h>
__device__ int gpu_partition(unsigned int *data, unsigned int *partition, unsigned int *ones, unsigned int* zeroes, int bit, int idx, unsigned int* warp_ones){
int one = 0;
int valid = 0;
int my_one, my_zero;
if (partition[idx]){
valid = 1;
if(data[idx] & (1ULL<<(31-bit))) one=1;}
__syncthreads();
if (valid){
if (one){
my_one=1;
my_zero=0;}
else{
my_one=0;
my_zero=1;}
}
else{
my_one=0;
my_zero=0;}
ones[idx]=my_one;
zeroes[idx]=my_zero;
unsigned int warp_one = __popc(__ballot(my_one));
if (!(threadIdx.x & 31))
warp_ones[threadIdx.x>>5] = warp_one;
__syncthreads();
// reduce
for (int i = 16; i > 0; i>>=1){
if (threadIdx.x < i)
warp_ones[threadIdx.x] += warp_ones[threadIdx.x + i];
__syncthreads();}
return warp_ones[0];
}
__global__ void gpu_radixkernel(unsigned int *data, unsigned int m, unsigned int n, unsigned int *result){
__shared__ unsigned int loc_data[1024];
__shared__ unsigned int loc_ones[1024];
__shared__ unsigned int loc_zeroes[1024];
__shared__ unsigned int loc_warp_ones[32];
int l=0;
int bit = 0;
unsigned int u = n;
if (n<2){
if ((n == 1) && !(threadIdx.x)) *result = data[0];
return;}
loc_data[threadIdx.x] = data[threadIdx.x];
loc_ones[threadIdx.x] = (threadIdx.x<n)?1:0;
__syncthreads();
unsigned int *next = loc_ones;
do {
int s = gpu_partition(loc_data, next, loc_ones, loc_zeroes, bit++, threadIdx.x, loc_warp_ones);
if ((u-s) > m){
u = (u-s);
next = loc_zeroes;}
else{
l = (u-s);
next = loc_ones;}}
while ((u != l) && (bit<32));
if (next[threadIdx.x]) *result = loc_data[threadIdx.x];
}
int partition(unsigned int *data, int l, int u, int bit){
unsigned int *temp = (unsigned int *)malloc(((u-l)+1)*sizeof(unsigned int));
int pos = 0;
for (int i = l; i<=u; i++)
if(data[i] & (1ULL<<(31-bit))) temp[pos++] = data[i];
int result = u-pos;
for (int i = l; i<=u; i++)
if(!(data[i] & (1ULL<<(31-bit)))) temp[pos++] = data[i];
pos = 0;
for (int i = u; i>=l; i--)
data[i] = temp[pos++];
free(temp);
return result;
}
unsigned int radixselect(unsigned int *data, int l, int u, int m, int bit){
if (l == u) return(data[l]);
if (bit > 32) {printf("radixselect fail!\n"); return 0;}
int s = partition(data, l, u, bit);
if (s>=m) return radixselect(data, l, s, m, bit+1);
return radixselect(data, s+1, u, m, bit+1);
}
int main(){
unsigned int data[8] = {32767, 22, 88, 44, 99, 101, 0, 7};
unsigned int data1[8];
for (int i = 0; i<8; i++){
for (int j=0; j<8; j++) data1[j] = data[j];
printf("value[%d] = %d\n", i, radixselect(data1, 0, 7, i, 0));}
unsigned int *d_data;
cudaMalloc((void **)&d_data, 1024*sizeof(unsigned int));
unsigned int h_result, *d_result;
cudaMalloc((void **)&d_result, sizeof(unsigned int));
cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int i = 0; i < 8; i++){
gpu_radixkernel<<<1,1024>>>(d_data, i, 8, d_result);
cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
printf("gpu result index %d = %d\n", i, h_result);
}
unsigned int data2[1024];
unsigned int data3[1024];
for (int i = 0; i < 1024; i++) data2[i] = rand();
cudaMemcpy(d_data, data2, 1024*sizeof(unsigned int), cudaMemcpyHostToDevice);
for (int i = 0; i < 1024; i++){
for (int j = 0; j<1024; j++) data3[j] = data2[j];
unsigned int cpuresult = radixselect(data3, 0, 1023, i, 0);
gpu_radixkernel<<<1,1024>>>(d_data, i, 1024, d_result);
cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost);
if (h_result != cpuresult) {printf("mismatch at index %d, cpu: %d, gpu: %d\n", i, cpuresult, h_result); return 1;}
}
printf("Finished\n");
return 0;
}
以下是一些注释,没有特别的顺序:
float
到int
很好奇。我没有想到通过在指数位序列后跟一系列尾数位来尝试按位基数选择的分支。它可能有用,(虽然我认为如果你包含符号位,它肯定不会起作用)但我再也不认为它是理解算法的核心。