如何在 CUDA 中对 64 位(double)数组进行归约(reduction)求和?
我的代码得到的结果只有预期答案的一半。
// Sums the first `size` doubles of `in3` and stores the total in r[0].
//
// Launch requirements:
//   - a single block: every block writes r[0], so the partial sums of several
//     blocks would overwrite each other;
//   - blockDim.x * sizeof(double) bytes of dynamic shared memory.
// Any block size (not only powers of two) and any size >= 0 are handled.
__global__ void Reduce(double* in3,double* r,int size)
{
    extern __shared__ double shareddata3[];
    const unsigned int tid = threadIdx.x;

    // Each thread accumulates a strided slice of the input. Inputs longer than
    // the launch are therefore fully covered, and threads that have no element
    // contribute 0.0 instead of leaving their shared slot uninitialised.
    const long long stride = (long long)gridDim.x * blockDim.x;
    double partial = 0.0;
    for (long long i = (long long)blockIdx.x * blockDim.x + tid; i < size; i += stride) {
        partial += in3[i];
    }
    shareddata3[tid] = partial;
    __syncthreads();

    // Tree reduction in shared memory. `active` is the number of live slots;
    // each pass folds the upper part onto the lower part. Rounding the kept
    // width up makes odd widths work: the middle slot is simply carried over.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int keep = (active + 1) / 2;
        if (tid + keep < active) {
            shareddata3[tid] += shareddata3[tid + keep];
        }
        // Outside the branch so that every thread of the block reaches it.
        __syncthreads();
        active = keep;
    }

    if (tid == 0) {
        r[0] = shareddata3[0];
    }
}
我的 kernel 启动(调用)方式是:
// Launch one block of 64 threads over 64 elements; the third launch argument
// (sharedmem3) is the dynamic shared-memory size in bytes. The kernel indexes
// shareddata3[0..63] as doubles, so it needs at least 64 * sizeof(double) bytes
// -- NOTE(review): the value of sharedmem3 is not shown here; confirm it.
Reduce<<<1,64,sharedmem3>>>(d_array,g,64);
答案 0 :(得分:1)
错误是您未向我们展示的代码的一部分。这是您的代码的完整可编译示例。
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
// Number of input elements; main() also uses it as the thread count of the
// single block it launches.
const int size(64);
// Forward declaration; the kernel itself is defined after main().
__global__ void Reduce(double* in3,double* r,int size);
// Checks the status of a CUDA runtime call and forwards it, together with the
// call site's file and line, to _assertCudaSuccess. Wrapped in do { } while (0)
// so the macro expands to exactly one statement and stays safe inside if/else.
#define assertCudaSuccess(ans) do { _assertCudaSuccess((ans), __FILE__, __LINE__); } while (0)
// Terminates the program when a CUDA runtime call did not succeed.
//   code - status returned by the CUDA call
//   file - source file of the call site (the macro passes __FILE__)
//   line - source line of the call site (the macro passes __LINE__)
// On failure the CUDA error string and the location are printed to stderr and
// the process exits with the error code as its status. `file` is a
// const char* because __FILE__ is a string literal, which must not be bound to
// a mutable char*.
inline void _assertCudaSuccess(cudaError_t code, const char *file, int line)
{
    if (code == cudaSuccess) {
        return;
    }
    std::fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    std::exit(code);
}
// Host driver: fills a page-locked host buffer with 0, 1, ..., size-1, sums it
// on the GPU with a single-block Reduce launch, and prints both the CPU
// reference sum and the GPU result. Waits for a key press before returning.
int main()
{
    const size_t inputBytes = size * sizeof(double);
    const size_t resultBytes = 1 * sizeof(double);

    // Device/host buffer pairs for the scalar result and for the input vector.
    double* devSum = 0;
    assertCudaSuccess(cudaMalloc(&devSum, resultBytes));
    double* hostSum = 0;
    assertCudaSuccess(cudaMallocHost(&hostSum, resultBytes));
    double* devInput = 0;
    assertCudaSuccess(cudaMalloc(&devInput, inputBytes));
    double* hostInput = 0;
    assertCudaSuccess(cudaMallocHost(&hostInput, inputBytes));

    // Build the input and the CPU reference sum in one pass.
    double referenceSum = 0.0;
    int k = 0;
    while (k < size) {
        hostInput[k] = k;
        referenceSum += k;
        ++k;
    }
    std::cout << "Expected result: " << referenceSum << std::endl;

    assertCudaSuccess(cudaMemcpy(devInput, hostInput, inputBytes, cudaMemcpyHostToDevice));

    // One block of `size` threads, with one double of dynamic shared memory per thread.
    Reduce<<<1, size, inputBytes>>>(devInput, devSum, size);
    assertCudaSuccess(cudaPeekAtLastError());    // launch-configuration errors
    assertCudaSuccess(cudaDeviceSynchronize());  // errors raised while the kernel ran

    assertCudaSuccess(cudaMemcpy(hostSum, devSum, resultBytes, cudaMemcpyDeviceToHost));
    std::cout << "Actual result: " << hostSum[0] << std::endl;

    assertCudaSuccess(cudaFree(devSum));
    assertCudaSuccess(cudaFreeHost(hostSum));
    assertCudaSuccess(cudaFree(devInput));
    assertCudaSuccess(cudaFreeHost(hostInput));

    std::cin.get();
    return 0;
}
// Sums the first `size` doubles of `in3` and stores the total in r[0].
//
// Launch requirements:
//   - a single block: every block writes r[0], so the partial sums of several
//     blocks would overwrite each other;
//   - blockDim.x * sizeof(double) bytes of dynamic shared memory.
// Any block size (not only powers of two) and any size >= 0 are handled.
__global__ void Reduce(double* in3, double* r, int size)
{
    extern __shared__ double shareddata3[];
    const unsigned int tid = threadIdx.x;

    // Each thread accumulates a strided slice of the input. Inputs longer than
    // the launch are therefore fully covered, and threads that have no element
    // contribute 0.0 instead of leaving their shared slot uninitialised.
    const long long stride = (long long)gridDim.x * blockDim.x;
    double partial = 0.0;
    for (long long i = (long long)blockIdx.x * blockDim.x + tid; i < size; i += stride) {
        partial += in3[i];
    }
    shareddata3[tid] = partial;
    __syncthreads();

    // Tree reduction in shared memory. `active` is the number of live slots;
    // each pass folds the upper part onto the lower part. Rounding the kept
    // width up makes odd widths work: the middle slot is simply carried over.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int keep = (active + 1) / 2;
        if (tid + keep < active) {
            shareddata3[tid] += shareddata3[tid + keep];
        }
        // Outside the branch so that every thread of the block reaches it.
        __syncthreads();
        active = keep;
    }

    if (tid == 0) {
        r[0] = shareddata3[0];
    }
}
输出:
Expected result: 2,016
Actual result: 2,016