如何在CUDA中对64位数组的向量进行归约(reduce)?

时间:2013-12-11 23:41:20

标签: vector cuda reduction

如何在CUDA中对64位数组的向量进行归约(reduce)?

我的代码得到的结果只有预期答案的一半。

// Sums the elements of in3 covered by the launching block and stores the
// total in r[0].
//
// Launch requirements:
//   - Dynamic shared memory: blockDim.x * sizeof(double) bytes (third launch
//     argument), one slot per thread.
//   - Thread 0 of every block writes to r[0], so a meaningful total is only
//     produced by a single-block launch with blockDim.x >= size.
//
// Parameters:
//   in3  - device pointer to at least `size` doubles
//   r    - device pointer to at least one double; receives the sum
//   size - number of valid elements in in3
//
// Threads whose global index is >= size contribute 0.0, and blockDim.x does
// not need to be a power of two.
__global__ void Reduce(double* in3,double* r,int size)
{
  extern __shared__ double shareddata3[];

  const unsigned int tid = threadIdx.x;
  const int id = blockIdx.x*blockDim.x + threadIdx.x;

  // Every thread must initialise its slot: shared memory starts out
  // uninitialised, and the loop below reads all blockDim.x slots.
  shareddata3[tid] = (id < size) ? in3[id] : 0.0;
  __syncthreads();

  // Tree reduction. `active` is the number of slots still holding partial
  // sums. Rounding the half up keeps the middle element alive when `active`
  // is odd, so no value is dropped for non-power-of-two block sizes. For
  // power-of-two sizes the pairing is identical to the classic s >>= 1 loop.
  for (unsigned int active = blockDim.x; active > 1; active = (active + 1) / 2) {
    const unsigned int half = (active + 1) / 2;
    if (tid < active / 2) {
      shareddata3[tid] += shareddata3[tid + half];
    }
    // Loop bounds are uniform across the block, so all threads reach this.
    __syncthreads();
  }

  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

我的kernel启动(launch)代码是:

Reduce<<<1,64,sharedmem3>>>(d_array,g,64);

1 个答案:

答案 0 :(得分:1)

错误出在您没有展示给我们的那部分代码中。下面是基于您的代码的完整可编译示例。

#include "cuda_runtime.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;

// Number of input elements; main also uses it as the thread count of the
// single block it launches.
const int size(64);

// Forward declaration; the kernel itself is defined after main.
__global__ void Reduce(double* in3,double* r,int size);

// Evaluates `ans` (an expression yielding cudaError_t) and terminates the
// program with a diagnostic if it is not cudaSuccess. The do/while(0) wrapper
// makes the macro expand to exactly one statement, so it is safe inside an
// un-braced if/else.
#define assertCudaSuccess(ans) \
  do { _assertCudaSuccess((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call and exits.
//
//   code - status returned by a CUDA runtime call
//   file - source file of the call site; const because __FILE__ is a string
//          literal and cannot bind to a non-const char* in standard C++
//   line - source line of the call site
//
// Does nothing when code == cudaSuccess. Otherwise prints the CUDA error
// string plus file/line to stderr and exits with the error code as the
// process status.
inline void _assertCudaSuccess(cudaError_t code, const char *file, int line)
{
  if (code != cudaSuccess) {
    fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

// Host driver: fills a pinned host buffer with 0..size-1, copies it to the
// device, runs Reduce in a single block of `size` threads, and prints the
// host-computed sum next to the sum read back from the device.
int main()
{
  // Byte counts shared by the allocations, copies and the kernel launch.
  const size_t resultBytes = 1 * sizeof(double);
  const size_t inputBytes = size * sizeof(double);

  // Result slot: device copy plus pinned host mirror.
  double* devSum;
  assertCudaSuccess(cudaMalloc(&devSum, resultBytes));
  double* hostSum;
  assertCudaSuccess(cudaMallocHost(&hostSum, resultBytes));

  // Input vector: device copy plus pinned host mirror.
  double* devInput;
  assertCudaSuccess(cudaMalloc(&devInput, inputBytes));
  double* hostInput;
  assertCudaSuccess(cudaMallocHost(&hostInput, inputBytes));

  // Build the input and accumulate the reference sum on the host.
  double reference = 0;
  int k = 0;
  while (k < size) {
    hostInput[k] = k;
    reference += k;
    ++k;
  }
  std::cout << "Expected result: " << reference << std::endl;

  assertCudaSuccess(cudaMemcpy(devInput, hostInput, inputBytes, cudaMemcpyHostToDevice));

  // One block, one thread per element, one shared-memory double per thread.
  Reduce<<<1, size, inputBytes>>>(devInput, devSum, size);

  // Surface launch-configuration errors first, then execution errors.
  assertCudaSuccess(cudaPeekAtLastError());
  assertCudaSuccess(cudaDeviceSynchronize());

  assertCudaSuccess(cudaMemcpy(hostSum, devSum, resultBytes, cudaMemcpyDeviceToHost));

  std::cout << "Actual result: " << hostSum[0] << std::endl;

  assertCudaSuccess(cudaFree(devSum));
  assertCudaSuccess(cudaFreeHost(hostSum));
  assertCudaSuccess(cudaFree(devInput));
  assertCudaSuccess(cudaFreeHost(hostInput));

  // Wait for a key press before exiting.
  std::cin.get();
  return 0;
}

// Sums the elements of in3 covered by the launching block and stores the
// total in r[0].
//
// Launch requirements:
//   - Dynamic shared memory: blockDim.x * sizeof(double) bytes (third launch
//     argument), one slot per thread.
//   - Thread 0 of every block writes to r[0], so a meaningful total is only
//     produced by a single-block launch with blockDim.x >= size.
//
// Parameters:
//   in3  - device pointer to at least `size` doubles
//   r    - device pointer to at least one double; receives the sum
//   size - number of valid elements in in3
//
// Threads whose global index is >= size contribute 0.0, and blockDim.x does
// not need to be a power of two.
__global__ void Reduce(double* in3, double* r, int size)
{
  extern __shared__ double shareddata3[];

  const unsigned int tid = threadIdx.x;
  const int id = blockIdx.x*blockDim.x + threadIdx.x;

  // Every thread must initialise its slot: shared memory starts out
  // uninitialised, and the loop below reads all blockDim.x slots.
  shareddata3[tid] = (id < size) ? in3[id] : 0.0;
  __syncthreads();

  // Tree reduction. `active` is the number of slots still holding partial
  // sums. Rounding the half up keeps the middle element alive when `active`
  // is odd, so no value is dropped for non-power-of-two block sizes. For
  // power-of-two sizes the pairing is identical to the classic s >>= 1 loop.
  for (unsigned int active = blockDim.x; active > 1; active = (active + 1) / 2) {
    const unsigned int half = (active + 1) / 2;
    if (tid < active / 2) {
      shareddata3[tid] += shareddata3[tid + half];
    }
    // Loop bounds are uniform across the block, so all threads reach this.
    __syncthreads();
  }

  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

输出:

Expected result: 2016
Actual result: 2016