Question

我的文件中有几个数字列表。例如：

.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434

现在，我想用GPU统计每个数字出现的次数。我认为这在GPU上会比在CPU上更快，因为每个线程可以处理一个列表。我应该在GPU上使用什么数据结构，才能方便地得到上述计数？例如，对于上面的数据，答案如下：

.333 = 2 times in entire file
.324 = 1 time

等。

我正在寻找一个通用的解决方案，而不是只适用于具有特定计算能力的设备的方案。

我刚刚按照Pavan的建议编写了内核，请看看我的实现是否高效：

// Number of unique values: distance from the start of the unique-key output
// to the end iterator returned by Thrust (thrust::pair exposes it as .first).
// NOTE(review): assumes new_end and d_Values are the objects produced by the
// Thrust snippet in Answer 1 - confirm against the surrounding code.
int uniqueEle = static_cast<int>(new_end.first - d_Values.begin());

int* count = NULL;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
// Ceiling division. The parentheses are required: "a + b - 1 / b" parses as
// "a + b - (1 / b)", which launched roughly one block per element.
int blocks = (uniqueEle + TPB - 1) / TPB;
//Cast d_I to raw pointer called d_rawI
// NOTE(review): the kernel differences first-occurrence positions, so d_rawI
// presumably has to point at the unique_by_key value output (d_Locations),
// not at the untouched 0..num-1 sequence - confirm.
if (blocks > 0) {   // a grid with zero blocks is an invalid launch configuration
    launch<<<blocks,TPB>>>(d_rawI,count,uniqueEle);
}

/*
 * Adjacent difference: count[k] = i[k + 1] - i[k] for every k in [0, n - 1).
 *
 *   i     - n integer positions (read only by this kernel)
 *   count - output, at least n ints; count[n - 1] is NOT written because
 *           there is no i[n]. The caller has to fill it (total number of
 *           elements minus i[n - 1]).
 *   n     - number of entries in i
 *
 * Launch layout: any 1D grid / 1D block. The loop strides by the total number
 * of launched threads, so the result is correct even when there are fewer
 * threads than elements. No shared memory is used, so nothing depends on the
 * block size and no block-boundary special case is needed.
 */
__global__ void launch(int *i, int* count, int n){
    const int stride = blockDim.x * gridDim.x;
    for (int id = blockDim.x * blockIdx.x + threadIdx.x; id < n - 1; id += stride) {
        // Both loads hit neighbouring addresses, so accesses within a warp
        // stay contiguous.
        count[id] = i[id + 1] - i[id];
    }
}

问题：如何修改此内核，使其能够处理任意大小的数组，即处理线程总数小于元素数量的情况？

Answer 1

以下是我在matlab中执行代码的方法

% Count how many times each value occurs in A.
A = [.333, .324, .123, .543, .00054, .2243, .333, .53343, .4434];
A = sort(A);                               % equal values must be adjacent for the index trick below
[values, locations] = unique(A, 'last');   % unique values and the index of the LAST occurrence of each
% Consecutive last-occurrence indices differ by exactly the number of copies.
% (:).' forces a row vector, because the orientation of the index output
% differs between MATLAB releases.
counts = diff([0, locations(:).']);

在普通cuda中没有简单的方法可以做到这一点，但您可以使用现有的库来执行此操作。

1）Thrust

从CUDA 4.0开始，Thrust随CUDA工具包一同提供。

使用下面的函数，可以把上面的MATLAB代码大致翻译成Thrust代码。我对Thrust并不十分精通，这里只是想让您了解需要查看哪些函数。

// Host-side input: the values to count and the positions 0..num-1.
float _A[] = {.333f, .324f, .123f, .543f, .00054f, .2243f, .333f, .53343f, .4434f};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
int num = 9;
// Values vector. Constructing a device_vector from a host range allocates
// device memory and copies the data, so no manual cudaMalloc/cudaMemcpy
// (and no raw device pointers) are needed.
thrust::device_vector<float> d_A(_A, _A + num);
// Need to sort to get same values together
thrust::stable_sort(d_A.begin(), d_A.end());
// Vector containing 0 to num-1, i.e. each element's position in the SORTED d_A
thrust::device_vector<int> d_I(_I, _I + num);
// Outputs: unique values, position of the first occurrence of each unique
// value, and room for the per-value counts. Positions and counts are integers.
thrust::device_vector<float> d_Values(num);
thrust::device_vector<int> d_Locations(num), d_counts(num);
// Find unique elements
typedef thrust::device_vector<float>::iterator valiter;
typedef thrust::device_vector<int>::iterator idxiter;
thrust::pair<valiter, idxiter> new_end;
// The *_copy variant writes to separate output ranges; for every run of equal
// keys it keeps the first key and the value paired with it.
new_end = thrust::unique_by_key_copy(d_A.begin(), d_A.end(), d_I.begin(),
                                     d_Values.begin(), d_Locations.begin());

现在，您已经得到了每个唯一值第一次出现的位置。接下来可以启动一个内核，计算d_Locations中从0到new_end之间相邻元素的差值；最后一个唯一值的计数则用num减去最后一个位置得到。

编辑（添加通过聊天提供的代码）

以下是计算这些差值的代码：

#define MAX_BLOCKS 65535
// Integer ceiling division. There must be no '=' between the parameter list
// and the replacement text, otherwise it is pasted into every expansion.
#define roundup(A, B) (((A) + (B) - 1) / (B))

/*
 * Turns first-occurrence positions into per-value counts:
 *   count[k] = i[k + 1] - i[k]   for k <  n - 1
 *   count[k] = total    - i[k]   for k == n - 1
 *
 *   i     - n positions, as int (the caller derives the pointer from a
 *           device_vector<int>)
 *   count - output, n ints
 *   n     - number of unique values
 *   total - total number of elements in the sorted input. When negative
 *           (the default) n is used instead, which reproduces the previous
 *           three-argument behaviour and is only correct if every value is
 *           unique.
 *
 * Launch layout: 1D blocks of any size on a 1D or 2D grid; blocks are
 * flattened row by row. Threads past n do nothing. No shared memory is used,
 * so the kernel does not depend on the block size.
 */
__global__ void kernel(const int *i, int *count, int n, int total = -1)
{
    const int bid = blockIdx.y * gridDim.x + blockIdx.x;
    const int id = blockDim.x * bid + threadIdx.x;
    if (id >= n) return;

    const int sentinel = (total >= 0) ? total : n;
    const int next = (id + 1 < n) ? i[id + 1] : sentinel;
    count[id] = next - i[id];
}

// Number of unique values: distance from the start of the unique-key output
// to the end iterator returned by Thrust (thrust::pair exposes it as .first).
int uniqueEle = static_cast<int>(new_end.first - d_Values.begin());
int* count = NULL;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));

int TPB = 256;
// Nothing to do for an empty input; this also avoids dividing by blocks_y == 0.
if (uniqueEle > 0) {
    int num_blocks = roundup(uniqueEle, TPB);
    // Fold the block count into two grid dimensions so that neither exceeds
    // MAX_BLOCKS.
    int blocks_y = roundup(num_blocks, MAX_BLOCKS);
    int blocks_x = roundup(num_blocks, blocks_y);
    dim3 blocks(blocks_x, blocks_y);

    // NOTE(review): d_rawI presumably has to be the raw pointer to the
    // first-occurrence positions (d_Locations) - confirm where it is created.
    // num is the total element count, needed for the last value's count.
    kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle, num);
}

2）ArrayFire

这是一个易于使用的、免费的基于数组的库。

您可以在ArrayFire中执行以下操作。

using namespace af;
// Input values (note the comma between .00054f and .2243f: two adjacent
// literals do not compile).
float h_A[] = {.333f, .324f, .123f, .543f, .00054f, .2243f, .333f, .53343f, .4434f};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
// NOTE(review): setunique / join / diff1 are used exactly as in the original
// answer; their signatures depend on the ArrayFire release - verify against
// the version in use.
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));

披露：我为AccelerEyes工作，开发此软件。

Answer 2

针对问题中最新补充的提问：要完成Pavan提出的Thrust方法，所需的差分内核大致如下：

/*
 * Adjacent difference through shared memory:
 *   count[k] = i[k + 1] - i[k]   for k <  n - 1
 *   count[k] = total    - i[k]   for k == n - 1
 *
 *   blcksz - compile-time block size. The shared tile holds blcksz + 1 ints
 *            and thread blcksz - 1 loads the extra element, so the kernel
 *            expects to be launched with blockDim.x == blcksz.
 *   i      - n positions (read only)
 *   count  - output, n ints
 *   n      - number of entries in i
 *   total  - value standing in for the non-existent i[n], i.e. the total
 *            number of elements when i holds first-occurrence positions.
 *            When negative (the default) n is used, which is the previous
 *            three-argument behaviour and only correct if n equals that total.
 *
 * Launch layout: 1D grid of any size. The loop runs to n rounded up to a
 * multiple of blcksz so that every thread of a block executes the same number
 * of iterations and reaches both barriers.
 */
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n, const int total = -1) {
    const int sentinel = (total >= 0) ? total : n;
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    const int strd = blockDim.x * gridDim.x;
    const int nmax = blcksz * ((n / blcksz) + ((n % blcksz > 0) ? 1 : 0));

    __shared__ int indices[blcksz + 1];

    for (; id < nmax; id += strd) {
        // Data load; out-of-range slots receive the sentinel.
        indices[threadIdx.x] = (id < n) ? i[id] : sentinel;
        // The last thread also fetches the first element of the next tile.
        if (threadIdx.x == (blcksz - 1))
            indices[blcksz] = ((id + 1) < n) ? i[id + 1] : sentinel;

        __syncthreads();

        // Differencing calculation
        const int diff = indices[threadIdx.x + 1] - indices[threadIdx.x];

        // Store
        if (id < n) count[id] = diff;

        // The tile is overwritten by the next iteration; wait until every
        // thread has read it.
        __syncthreads();
    }
}

Answer 3

这是一个解决方案：

/*
 * Brute-force occurrence count: for every position p < N, b[p] receives the
 * number of entries of a that compare equal (==) to a[p], including a[p]
 * itself.
 *
 *   a - N input values
 *   b - N output counts
 *   N - number of elements
 *
 * One thread handles one position; threads whose index is N or larger exit
 * immediately. Each active thread scans all N inputs.
 */
__global__ void counter(float* a, int* b, int N)
{
    const int self = threadIdx.x + blockDim.x * blockIdx.x;
    if (self >= N)
        return;

    const float needle = a[self];
    int matches = 0;
    int probe = 0;
    while (probe < N)
    {
        matches += (needle == a[probe]) ? 1 : 0;
        ++probe;
    }

    b[self] = matches;
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Demo driver: copies nine sample values to the device, runs counter on them
 * and prints how often each value occurs. Returns 0 on success and 1 if an
 * allocation, copy or the kernel launch failed.
 */
int main()
{
    const float h_a[] = { .333f, .324f, .123f, .543f, .00054f,
                          .2243f, .333f, .53343f, .4434f };
    // The problem size comes from the data, not from the launch configuration.
    const int N = (int)(sizeof(h_a) / sizeof(h_a[0]));
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;   // ceil-div; counter ignores surplus threads

    int* h_b = (int*)malloc(N * sizeof(int));
    float* d_a = NULL;
    int* d_b = NULL;

    bool ok = (h_b != NULL);
    if (!ok)
        fprintf(stderr, "host allocation failed\n");

    // Each step runs only if everything before it succeeded.
    ok = ok && cudaOk(cudaMalloc((void**)&d_a, N * sizeof(float)), "cudaMalloc(d_a)");
    ok = ok && cudaOk(cudaMalloc((void**)&d_b, N * sizeof(int)), "cudaMalloc(d_b)");
    ok = ok && cudaOk(cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");

    if (ok)
    {
        counter<<<blocks, threads>>>(d_a, d_b, N);
        // A launch does not return a status; a bad configuration shows up here.
        ok = cudaOk(cudaGetLastError(), "counter launch");
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    ok = ok && cudaOk(cudaMemcpy(h_b, d_b, N * sizeof(int), cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device to host)");

    if (ok)
    {
        for (int i = 0; i < N; i++)
        {
            printf("%f = %d times\n", h_a[i], h_b[i]);
        }
    }

    // cudaFree and free accept null pointers, so cleanup is unconditional.
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_b);
    getchar();
    return ok ? 0 : 1;
}

使用GPU获取数字列表的统计信息

3 个答案: