我已经实现了两个版本的add。两者中的加法概念完全相同。唯一的区别是在一个代码中(下面的第一个代码)我使用全局内存,而对于第二个代码我使用共享内存。正如在几个地方提到的,共享内存版本应该更快但是对于我的情况,全局内存版本更快。 请告诉我出错的地方。注意:我有一个cc 2.1的cc。因此,对于共享内存,我有32个银行。由于我在示例中仅使用16个整数,因此我的代码不应存在银行冲突。 如果这是正确的,请告诉我。
全球版
#include<stdio.h>
__global__ void reductionGlobal(int* in, int sizeArray, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid < sizeArray ){
if(tid % (offset * 2 ) == 0){
in[tid] += in[tid+offset];
}
}
}
int main(){
int size = 16; // size of present input array. Changes after every loop iteration
int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
int* gidata;
cudaMalloc((void**)&gidata, size* sizeof(int));
cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice);
int offset = 1;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
while(offset < size){
//use kernel launches to synchronize between different block. syncthreads() will not work
reductionGlobal<<<4,4>>>(gidata,size,offset);
offset *=2;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime , start, stop);
printf("time is %f ms", elapsedTime);
int* output = (int*)malloc( size * sizeof(int));
cudaMemcpy(output, gidata, size * sizeof(int), cudaMemcpyDeviceToHost);
printf("The sum of the array using only global memory is %d\n",output[0]);
getchar();
return 0;
}
共享内存版本:
#include<stdio.h>
__global__ void computeAddShared(int *in , int *out, int sizeInput){
extern __shared__ float temp[];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int ltid = threadIdx.x;
temp[ltid] = 0;
while(tid < sizeInput){
temp[ltid] += in[tid];
tid+=gridDim.x * blockDim.x; // to handle array of any size
}
__syncthreads();
int offset = 1;
while(offset < blockDim.x){
if(ltid % (offset * 2) == 0){
temp[ltid] = temp[ltid] + temp[ltid + offset];
}
__syncthreads();
offset*=2;
}
if(ltid == 0){
out[blockIdx.x] = temp[0];
}
}
int main(){
int size = 16; // size of present input array. Changes after every loop iteration
int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
int* gidata;
int* godata;
cudaMalloc((void**)&gidata, size* sizeof(int));
cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice);
int TPB = 4;
int blocks = 10; //to get things kicked off
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
while(blocks != 1 ){
if(size < TPB){
TPB = size; // size is 2^sth
}
blocks = (size+ TPB -1 ) / TPB;
cudaMalloc((void**)&godata, blocks * sizeof(int));
computeAddShared<<<blocks, TPB,TPB>>>(gidata, godata,size);
cudaFree(gidata);
gidata = godata;
size = blocks;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime , start, stop);
printf("time is %f ms", elapsedTime);
int *output = (int*)malloc(sizeof(int));
cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost);
//Cant free either earlier as both point to same location
cudaFree(godata);
cudaFree(gidata);
printf("The sum of the array is %d\n", output[0]);
getchar();
return 0;
}
答案 0 :(得分:2)
这里有很多错误。首先,一些一般性评论:
至于实际的减少代码本身: