Question

我在CUDA上为一个更大的项目制作了一个排序算法，我决定实施一个Bitonic排序。我将要排序的元素数量总是2的幂，实际上将是512.我需要一个具有最终位置的数组，因为这个方法将用于排序代表质量矩阵的数组另一个解决方案。

适应性是数组i，排序，numElements是元素的数量，orden最初是一个带有numElements位置的空数组，将以这种方式在最开始填充：orden[i]=i。实际上，orden与此问题无关，但我保留了它。

我的问题是某些值没有正确排序，直到现在我还无法弄清楚我有什么问题。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <ctime>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <device_functions.h>
#include "float.h"


__global__ void sorting(int * orden, float * fitness, int numElements);

// Populating array with random values for testing purposes
__global__ void populate( curandState * state, float * fitness{

    curandState localState = state[threadIdx.x];
    int a = curand(&localState) % 500;
    fitness[threadIdx.x] = a;
}

//Curand setup for the populate method 
__global__ void setup_cuRand(curandState * state, unsigned long seed)
{
    int id = threadIdx.x;
    curand_init(seed, id, 0, &state[id]);
}

int main()
{
    float * arrayx;
    int numelements = 512;
    int * orden;
    float arrayCPU[512] = { 0 };
    curandState * state;

    cudaDeviceReset();
    cudaSetDevice(0);
    cudaMalloc(&state, numelements * sizeof(curandState));
    cudaMalloc((void **)&arrayx, numelements*sizeof(float));
    cudaMalloc((void **)&orden, numelements*sizeof(int));






    setup_cuRand << <1, numelements >> >(state, unsigned(time(NULL)));

    populate << <1, numelements >> > (state, arrayx);
    cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < numelements; i++)
        printf("fitness[%i] = %f\n", i, arrayCPU[i]);

    sorting << <1, numelements >> >(orden, arrayx, numelements);
    printf("\n\n");

    cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < numelements; i++)
        printf("fitness[%i] = %f\n", i, arrayCPU[i]);



    cudaDeviceReset();


    return 0;
}
__device__ bool isValid(float n){
    return !(isnan(n) || isinf(n) || n != n || n <= FLT_MIN || n >= FLT_MAX);

}

__global__ void sorting(int * orden, float * fitness, int numElements){
    int i = 0;
    int j = 0;
    float f = 0.0;
    int aux = 0;

    //initial orden registered (1, 2, 3...)
    orden[threadIdx.x] = threadIdx.x;
    //Logarithm on base 2 of numElements
    for (i = 2; i <= numElements; i = i * 2){
        // descending from i reducing to half each iteration
        for (j = i; j >= 2; j = j / 2){

            if (threadIdx.x % j  < j / 2){
                __syncthreads();
                // ascending or descending consideration using (threadIdx.x % (i*2) < i) 
                if ((threadIdx.x % (i * 2) < i) && (fitness[threadIdx.x] >  fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x])) ||
                    ((threadIdx.x % (i * 2) >= i) && (fitness[threadIdx.x] <= fitness[threadIdx.x + j / 2] || !isValid(fitness[threadIdx.x + j / 2])))){

                    aux = orden[threadIdx.x];
                    orden[threadIdx.x] = orden[threadIdx.x + j / 2];
                    orden[threadIdx.x + j / 2] = aux;
                    //Se reubican los fitness
                    f = fitness[threadIdx.x];
                    fitness[threadIdx.x] = fitness[threadIdx.x + j / 2];
                    fitness[threadIdx.x + j / 2] = f;
                }
            }
        }
    }
}

例如，我随机执行输出：

A random execution

这是我的比特排序的代表：

Bitonic sorting Schema，箭头指向比较的最差值

Answer 1

以下是我发现的问题：

在您发布的代码中，这不会编译：

__global__ void populate( curandState * state, float * fitness{
                                                              ^
                                                   missing close parenthesis

我在那里添加了一个紧密的括号。

没有必要在这些cudaMemcpy语句中获取数组的地址：
```
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
....
cudaMemcpy(&arrayCPU, arrayx, numelements * sizeof(float), cudaMemcpyDeviceToHost);
```
数组名称已经是数组的地址，所以我删除了＆符号。如果使用动态分配的数组，则会破坏此类用法。
您在__syncthreads()的使用已被破坏：
```
for (j = i; j >= 2; j = j / 2){

    if (threadIdx.x % j  < j / 2){
        __syncthreads();
```
除非条件语句在整个线程块中统一评估，否则条件语句中__syncthreads()的使用通常是不正确的。 documentation中介绍了这一点。我们可以通过微小的改变达到预期的效果：
```
for (j = i; j >= 2; j = j / 2){
    __syncthreads();
    if (threadIdx.x % j  < j / 2){
```

通过上述更改，对于大多数情况，您的代码似乎正常运行。如果您打算正确排序0（或任何负值），您在有效性检查中对FLT_MIN的使用也是有问题的。一般来说，FLT_MIN是一个very small, close to zero的数字。如果你认为这是一个很大的负数，那就不是。因此，零是随机数生成器的可能输出，并且不会正确排序。我会把这个留给你解决，它应该是直截了当的，但这将取决于你最终想要实现的目标。（如果您只想排序正的非零浮点值，则测试可能没问题，但在这种情况下，您的随机数生成器可以返回0.。

在cuda中进行的比特排序错误地排列了一些值

1 个答案: