我在文件上有几个数字列表。例如,
.333, .324, .123 , .543, .00054
.2243, .333, .53343 , .4434
现在,我想获得使用GPU发生每个数字的次数。我相信这在GPU上比在CPU上更快,因为每个线程可以处理一个列表。我应该在GPU上使用什么数据结构来轻松获得上述计数。例如,对于上述内容,答案如下:
.333 = 2 times in entire file
.324 = 1 time
等。
我正在寻找一般解决方案。不仅适用于具有特定计算能力的设备
只需编写Pavan建议的内核,看看我是否有效地实现了它:
int uniqueEle = newend.valiter – d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int)); // stores the count of each unique element
int TPB = 256;
int blocks = uniqueEle + TPB -1 / TPB;
//Cast d_I to raw pointer called d_rawI
launch<<<blocks,TPB>>>(d_rawI,count,uniqueEle);
__global__ void launch(int *i, int* count, int n){
int id = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int indexes[256];
if(id < n ){
indexes[threadIdx.x] = i[id];
//as occurs between two blocks
if(id % 255 == 0){
count[indexes] = i[id+1] - i[id];
}
}
__syncthreads();
if(id < ele - 1){
if(threadIdx.x < 255)
count[id] = indexes[threadIdx.x+1] – indexes[threadIdx.x];
}
}
问题:如何修改此内核以便它处理任意大小的数组。即,当线程总数<1时处理条件。元素数量
答案 0 :(得分:5)
以下是我在matlab中执行代码的方法
A = [333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434];
[values, locations] = unique(A); % Find unique values and their locations
counts = diff([0, locations]); % Find the count based on their locations
在普通cuda中没有简单的方法可以做到这一点,但您可以使用现有的库来执行此操作。
1)Thrust
它还附带CUDA 4.0的CUDA工具包。
通过使用以下函数,可以将matlab代码粗略地转换为推力。我不太精通推力,但我只是想让你知道要看什么样的惯例。
float _A[] = {.333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434};
int _I[] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
float *A, *I;
// Allocate memory on device and cudaMempCpy values from _A to A and _I to I
int num = 9;
// Values vector
thrust::device_vector<float>d_A(A, A+num);
// Need to sort to get same values together
thrust::stable_sort(d_A, d_A+num);
// Vector containing 0 to num-1
thrust::device_vector<int>d_I(I, I+num);
// Find unique values and elements
thrust::device_vector<float>d_Values(num), d_Locations(num), d_counts(num);
// Find unique elements
thrust::device_vector<float>::iterator valiter;
thrust::device_vector<int>::iterator idxiter;
thrust::pair<valiter, idxiter> new_end;
new_end = thrust::unique_by_key(d_A, d_A+num, d_I, d_Values, d_Locations);
现在,您拥有每个唯一值的第一个实例的位置。您现在可以启动内核以查找d_Locations中从0到new_end的相邻元素之间的差异。从num中减去最终值以获得最终位置的计数。
编辑(添加通过聊天提供的代码)
以下是需要完成差异代码的方法
#define MAX_BLOCKS 65535
#define roundup(A, B) = (((A) + (B) - 1) / (B))
int uniqueEle = newend.valiter – d_A;
int* count;
cudaMalloc((void**)&count, uniqueEle * sizeof(int));
int TPB = 256;
int num_blocks = roundup(uniqueEle, TPB);
int blocks_y = roundup(num_blocks, MAX_BLOCKS);
int blocks_x = roundup(num_blocks, blocks_y);
dim3 blocks(blocks_x, blocks_y);
kernel<<<blocks,TPB>>>(d_rawI, count, uniqueEle);
__global__ void kernel(float *i, int* count, int n)
{
int tx = threadIdx.x;
int bid = blockIdx.y * gridDim.x + blockIdx.x;
int id = blockDim.x * bid + tx;
__shared__ int indexes[256];
if (id < n) indexes[tx] = i[id];
__syncthreads();
if (id < n - 1) {
if (tx < 255) count[id] = indexes[tx + 1] - indexes[tx];
else count[id] = i[id + 1] - indexes[tx];
}
if (id == n - 1) count[id] = n - indexes[tx];
return;
}
这是一个易于使用的 free 基于数组的库。
您可以在ArrayFire中执行以下操作。
using namespace af;
float h_A[] = {.333, .324, .123 , .543, .00054 .2243, .333, .53343 , .4434};
int num = 9;
// Transfer data to device
array A(9, 1, h_A);
array values, locations, original;
// Find the unique values and locations
setunique(values, locations, original, A);
// Locations are 0 based, add 1.
// Add *num* at the end to find count of last value.
array counts = diff1(join(locations + 1, num));
披露:我为AccelerEyes工作,开发此软件。
答案 1 :(得分:1)
要回答这个问题的最新问题 - 完成thrust方法proposed by Pavan的差异内核可能看起来像这样:
template<int blcksz>
__global__ void diffkernel(const int *i, int* count, const int n) {
int id = blockDim.x * blockIdx.x + threadIdx.x;
int strd = blockDim.x * gridDim.x;
int nmax = blcksz * ((n/blcksz) + ((n%blcksz>0) ? 1 : 0));
__shared__ int indices[blcksz+1];
for(; id<nmax; id+=strd) {
// Data load
indices[threadIdx.x] = (id < n) ? i[id] : n;
if (threadIdx.x == (blcksz-1))
indices[blcksz] = ((id+1) < n) ? i[id+1] : n;
__syncthreads();
// Differencing calculation
int diff = indices[threadIdx.x+1] - indices[threadIdx.x];
// Store
if (id < n) count[id] = diff;
__syncthreads();
}
}
答案 2 :(得分:-2)
这是一个解决方案:
__global__ void counter(float* a, int* b, int N)
{
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if(idx < N)
{
float my = a[idx];
int count = 0;
for(int i=0; i < N; i++)
{
if(my == a[i])
count++;
}
b[idx]=count;
}
}
int main()
{
int threads = 9;
int blocks = 1;
int N = blocks*threads;
float* h_a;
int* h_b;
float* d_a;
int* d_b;
h_a = (float*)malloc(N*sizeof(float));
h_b = (int*)malloc(N*sizeof(int));
cudaMalloc((void**)&d_a,N*sizeof(float));
cudaMalloc((void**)&d_b,N*sizeof(int));
h_a[0]= .333f;
h_a[1]= .324f;
h_a[2]= .123f;
h_a[3]= .543f;
h_a[4]= .00054f;
h_a[5]= .2243f;
h_a[6]= .333f;
h_a[7]= .53343f;
h_a[8]= .4434f;
cudaMemcpy(d_a,h_a,N*sizeof(float),cudaMemcpyHostToDevice);
counter<<<blocks,threads>>>(d_a,d_b,N);
cudaMemcpy(h_b,d_b,N*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0; i < N; i++)
{
printf("%f = %d times\n",h_a[i],h_b[i]);
}
cudaFree(d_a);
cudaFree(d_b);
free(h_a);
free(h_b);
getchar();
return 0;
}