为什么此Radix排序CUDA代码仅排序32个元素?

时间:2018-07-31 07:04:42

标签: visual-studio sorting cuda gpu radix-sort

当我在Visual Studio 15中执行此代码时,它仅对32个元素进行排序。如果我将WSIZE设置为大于32或小于32,它将显示与输出相同的未排序元素。谁能帮我吗?

我的系统信息。

处理器-Intel(R)i5-6200U CPU @ 2.30GHz,2400 Mhz,2个Core,4个逻辑处理器

RAM-8GB

专用显卡-NVIDIA GeForce 940M 4GB(384个CUDA内核)

这是WSIZE的输出,设置为32 https://i.imgur.com/geQ9VUZ.jpg

这是WSIZE的输出,设置为19 https://i.imgur.com/4K0xkep.jpg

这是WSIZE的输出,设置为50 https://i.imgur.com/M8irQhs.jpg

#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
#endif

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

#define WSIZE 32 /*to set No. of elements to sort and also set No. of threads*/
#define LOOPS 2
#define UPPER_BIT 4
#define LOWER_BIT 0

__device__ unsigned int ddata[WSIZE];
__device__ int ddata_s[WSIZE];

__global__ void parallelRadix()
{
    // This data in shared memory
    __shared__ volatile unsigned int sdata[WSIZE];

    // Load from global into shared variable
    sdata[threadIdx.x] = ddata[threadIdx.x];

    unsigned int bitmask = 1;
    unsigned int offset = 0;
    // -1, -2, -4, -8, -16, -32, -64, -128, -256,...
    unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x;
    unsigned int mypos;

    // For each LSB to MSB
    for (int i = LOWER_BIT; i < UPPER_BIT; i++)
    {
        unsigned int mydata = sdata[((WSIZE - 1) - threadIdx.x) + offset];
        unsigned int mybit = mydata&bitmask;
        // Get population of ones and zeroes
        unsigned int ones = __ballot(mybit);
        unsigned int zeroes = ~ones;
        // Switch ping-pong buffers
        offset ^= WSIZE;

        // Do zeroes, then ones
        if (!mybit)
        {
            mypos = __popc(zeroes&thrmask);
        }
        else {      // Threads with a one bit
                    // Get my position in ping-pong buffer
            mypos = __popc(zeroes) + __popc(ones&thrmask);
        }

        // Move to buffer  (or use shfl for cc 3.0)
        sdata[mypos - 1 + offset] = mydata;
        // Repeat for next bit
        bitmask <<= 1;
    }
    // Put results to global
    ddata[threadIdx.x] = sdata[threadIdx.x + offset];
}

int main() {

    /* Parallel Radix Sort */

    unsigned int hdata[WSIZE];
    float totalTime = 0;
    LARGE_INTEGER cicles;
    for (int lcount = 0; lcount < LOOPS; lcount++)
    {
        //srand(time(NULL));
        // Array elements have value in range of 1024
        unsigned int range = 1U << UPPER_BIT;

        // Fill array with random elements
        // Range = 1024
        //srand(time(0));
        QueryPerformanceCounter(&cicles);
        srand(cicles.QuadPart);
        printf("\n input array  %d\n", lcount);
        for (int i = 0; i < WSIZE; i++)
        {
            hdata[i] = rand() % range;
            printf("%u ", hdata[i]);

        }


        // Copy data from host to device
        cudaMemcpyToSymbol(ddata, hdata, WSIZE * sizeof(unsigned int));

        // Execution time measurement, that point starts the clock
        high_resolution_clock::time_point t1 = high_resolution_clock::now();
        parallelRadix << < 1, WSIZE >> >();
        // Make kernel function synchronous
        cudaDeviceSynchronize();
        // Execution time measurement, that point stops the clock
        high_resolution_clock::time_point t2 = high_resolution_clock::now();

        // Execution time measurement, that is the result
        auto duration = duration_cast<milliseconds>(t2 - t1).count();

        // Summination of each loops' execution time
        totalTime += (float)duration / 1000.00;

        // Copy data from device to host
        cudaMemcpyFromSymbol(hdata, ddata, WSIZE * sizeof(unsigned int));

        printf("\n sorted array %d\n", lcount);
        for (int i = 0; i < WSIZE; i++)
            printf("%u ", hdata[i]);


    }
    printf("\n\n");
    printf("Parallel Radix Sort:\n");
    printf("Array size = %d\n", WSIZE * LOOPS);
    printf("Time elapsed = %fseconds\n", totalTime);

}

1 个答案:

答案 0 :(得分:4)

该代码来自here。我写的。

它仅设计用于对32个元素进行排序,因为它被设计为只能在32个线程的CUDA中使用单个扭曲。它从来没有设计过或打算对超过32个元素进行排序,并且它在经纱级别使用的许多技巧都必须重写才能在大于经纱规模下工作。 WSIZE代表WARP SIZE,在CUDA中,经线的大小为32个线程。您无法更改经线大小。

如果要对更大的数据集进行排序,请使用thrustcub中的操作。