我正在努力使代码并行运行(CUDA)。简化的代码是:
// Serial accumulation loop from the question (pseudo-code; `...` marks omitted parts).
float sum = ... // sum starts from some number
for (i = 0; i < N; i++){
f = ... // f receives the float returned by some function
// every iteration adds into the same variable `sum`
sum += f;
}
我遇到的问题是 sum += f,因为它需要在线程之间共享 sum。我在声明 sum 时尝试使用 __shared__ 限定符(即 __shared__ float sum),但这不起作用(它没有给出正确的结果)。我也听说过归约(reduction),并且知道如何在 OpenMP 中使用它,但不知道如何在这里应用它。
非常感谢任何帮助。谢谢!
答案 0 :(得分:5)
以下是代码:
#include <stdio.h>
// Writes one value per thread into f: element k of f[0..len) receives k,
// standing in for the real per-iteration computation of f.
// Threads whose global index is not below len leave f untouched.
__global__ void para(float* f, int len) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= len) {
        return;
    }
    // placeholder for the value computed in iteration `gid`
    f[gid] = gid;
}
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Fills an array on the GPU with the `para` kernel, copies it back and
// prints the sum of its elements computed serially on the host.
// Returns 0 on success and 1 if an allocation or any CUDA call fails.
int main(int argc, char ** argv) {
    (void)argc;
    (void)argv;

    const int inputLength = 1024;
    const int threadsPerBlock = 256;
    const size_t size = inputLength * sizeof(float);

    float *d_f = NULL;
    float *h_f = (float *) malloc(size);
    if (h_f == NULL) {
        fprintf(stderr, "failed to allocate %lu bytes of host memory\n", (unsigned long)size);
        return 1;
    }

    // No host-to-device copy is needed: the kernel writes every element of
    // d_f, and h_f holds no meaningful data at this point.
    bool ok = cudaOk(cudaMalloc((void**)&d_f, size), "cudaMalloc");
    if (ok) {
        // ceil-div so no spare block is launched when the length is a
        // multiple of the block size
        dim3 DimGrid((inputLength + threadsPerBlock - 1) / threadsPerBlock, 1, 1);
        dim3 DimBlock(threadsPerBlock, 1, 1);
        para<<<DimGrid, DimBlock>>>(d_f, inputLength);
        // launch-configuration errors show up in cudaGetLastError, execution
        // errors at the next synchronizing call
        ok = cudaOk(cudaGetLastError(), "para launch")
          && cudaOk(cudaDeviceSynchronize(), "para execution")
          && cudaOk(cudaMemcpy(h_f, d_f, size, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    }
    // d_f is still NULL if cudaMalloc failed; cudaFree accepts that
    cudaFree(d_f);

    if (ok) {
        // serial sum on the host (this is the part a parallel reduction
        // on the GPU could replace)
        float sum = 0;
        for (int i = 0; i < inputLength; i++) {
            sum += h_f[i];
        }
        printf("%6.4f\n", sum);
    }

    free(h_f);
    return ok ? 0 : 1;
}
上面的求和部分(目前在主机端串行完成)可以用一个可用的 CUDA 并行求和归约实现来代替(例如 this one)。稍后我会抽时间修改它。
修改:
以下是使用 CUDA 执行并行归约的代码:
#include <stdio.h>
// Stores k into f[k] for every k in [0, len); the assignment stands in for
// the real computation of the k-th value of f.
// Each thread handles the single element at its global index and does
// nothing when that index is len or larger.
__global__ void para(float* f, int len) {
    const unsigned int blockOffset = blockDim.x * blockIdx.x;
    const int idx = threadIdx.x + blockOffset;
    const bool inRange = idx < len;
    if (inRange) {
        // placeholder for the value computed in iteration `idx`
        f[idx] = idx;
    }
}
// One pairwise-add pass over f[0..len): the thread with global index k adds
// f[k + strid] into f[k] whenever k + strid < len; other threads do nothing.
// NOTE: an element at index j >= strid is read by thread j - strid and, if
// j + strid < len, also written by thread j with no synchronization between
// them, so the result is only well defined when 2 * strid >= len.
__global__ void strideSum(float *f, int len, int strid){
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    const int partner = self + strid;
    if (partner >= len) {
        return;
    }
    f[self] += f[partner];
}
#define BLOCKSIZE 256
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Fills an array on the GPU with `para`, sums it once serially on the host
// (for validation) and once on the GPU with repeated `strideSum` passes,
// then prints both results.
// Returns 0 on success and 1 if an allocation or any CUDA call fails.
int main(int argc, char ** argv) {
    (void)argc;
    (void)argv;

    const int inputLength = 4096;
    const size_t size = inputLength * sizeof(float);
    float sum = 0, d_sum = 0;

    float *d_f = NULL;
    float *h_f = (float *) malloc(size);
    if (h_f == NULL) {
        fprintf(stderr, "failed to allocate %lu bytes of host memory\n", (unsigned long)size);
        return 1;
    }

    // No host-to-device copy is needed: the kernel writes every element of
    // d_f, and h_f holds no meaningful data at this point.
    bool ok = cudaOk(cudaMalloc((void**)&d_f, size), "cudaMalloc");
    if (ok) {
        // ceil-div so no spare block is launched when the length is a
        // multiple of BLOCKSIZE
        dim3 DimGrid((inputLength + BLOCKSIZE - 1) / BLOCKSIZE, 1, 1);
        dim3 DimBlock(BLOCKSIZE, 1, 1);
        para<<<DimGrid, DimBlock>>>(d_f, inputLength);
        ok = cudaOk(cudaGetLastError(), "para launch")
          && cudaOk(cudaDeviceSynchronize(), "para execution");
    }

    // Serial sum on the host, intended only as a validity check for the GPU
    // result; this copy and loop can be removed without affecting d_sum.
    if (ok) {
        ok = cudaOk(cudaMemcpy(h_f, d_f, size, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    }
    if (ok) {
        for (int i = 0; i < inputLength; i++) {
            sum += h_f[i];
        }
    }

    // Parallel reduction on the GPU. Each pass folds the upper part of the
    // live range [0, len) onto the lower part. The stride is ceil(len / 2),
    // so for odd len the middle element is kept instead of being dropped,
    // and no element is read by one thread while written by another.
    for (int len = inputLength; ok && len > 1; ) {
        const int stride = (len + 1) / 2;
        const int active = len - stride;   // threads that actually add
        strideSum<<<(active + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_f, len, stride);
        // Passes are issued to the same stream and therefore run in order;
        // the blocking cudaMemcpy below waits for all of them.
        ok = cudaOk(cudaGetLastError(), "strideSum launch");
        len = stride;
    }
    if (ok) {
        ok = cudaOk(cudaMemcpy(&d_sum, d_f, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy (reduction result)");
    }
    if (ok) {
        printf("Host -> %6.4f, Device -> %6.4f\n", sum, d_sum);
    }

    // d_f is still NULL if cudaMalloc failed; cudaFree accepts that
    cudaFree(d_f);
    free(h_f);
    return ok ? 0 : 1;
}
答案 1 :(得分:1)
您想要的是将数值范围映射到各个线程,让每个线程对自己负责的范围求和,然后进入归约阶段。归约阶段会把每个线程得到的部分和加在一起。