Question

我有2000个2D阵列（每个阵列是1000x1000）我需要计算每个阵列的平均值并将结果放在一个2000矢量中。

我尝试通过为每个2D数组调用内核来做到这一点，但它很天真我想立即进行计算。

这就是我所做的是一个2D阵列的内核。我想让内核在一个内核中为2000个2D数组做这个。

#include <stdio.h>
#include <cuda.h>
#include <time.h>

void init_mat(float *a, const int N, const int M);
void print_mat(float *a, const int N, const int M, char *d);
void print_array(float *a, const int N, char *d);









const int threadsPerBlock=256;

__global__
void kernel(float *mat, float *out, const int N, const int M){

    __shared__ float cache[threadsPerBlock];

    int tid=threadIdx.x+blockIdx.x*blockDim.x;
    int cacheIndex = threadIdx.x;
    float sum=0;



    if(tid<M){
        for(int i=0; i<N; i++)
            sum += mat[(i*M)+tid];
            cache[cacheIndex] = sum;
            out[tid] =cache[cacheIndex];
    }

    __syncthreads();

    int i = blockDim.x/2;
    while(i!=0){
        if(cacheIndex<i)
            cache[cacheIndex]+= cache[cacheIndex +i];
        __syncthreads();
        i/=2;
    }

   if(cacheIndex==0)
            out[blockIdx.x]=cache[0];


}




int main (void) {


    srand( time(NULL) );

            float *a, *b, *c;
                float *dev_a, *dev_b, *dev_c;

            int N=1000;
            int M=1000;

            b=(float*)malloc(sizeof(float)*N*M);
            c=(float*)malloc(sizeof(float)*M);

                init_mat(b, N, M);

            printf("<<<<<<<<<< initial data:\n");

                print_mat(b, N, M, "matrix");



                cudaMalloc((void**)&dev_b, sizeof(float)*N*M);
                cudaMalloc((void**)&dev_c, sizeof(float)*M);


                cudaMemcpy(dev_b, b, sizeof(float)*N*M, cudaMemcpyHostToDevice);

            printf("\n\nRunning Kernel...\n\n");
                kernel<<<M/256+1, 256>>>(dev_b, dev_c, N, M);


                cudaMemcpy(c, dev_c, sizeof(float)*M, cudaMemcpyDeviceToHost);

                cudaFree(dev_a);
                cudaFree(dev_b);
                cudaFree(dev_c);

            printf(">>>>>>>>>> final data:\n");
                print_array(c, M, "out-vector");
};




void init_mat(float *a, const int N, const int M) {
        int i, j;
        for(i=0; i<N; i++)
            for(j=0; j<M; j++)
                    a[i*M+j] = rand() % 100 + 1;
}
void print_mat(float *a, const int N, const int M, char *d) {
        int i, j;
        for(i=0; i<N; i++){
        printf("\n%s[%d]:", d, i);
        for (j=0; j<M; j++)
                    printf("\t%6.4f", a[i*M+j]);
    }
    printf("\n");
}

void print_array(float *a, const int N, char *d) {
        int i;
        for(i=0; i<N; i++)
                printf("\n%s[%d]: %f",d, i, a[i]);
    printf("\n");
}

任何帮助谢谢

Answer 1

对于相当大数量的阵列（例如2000）和相当大的阵列（例如2000），如果我们分配块以对每个阵列执行总和减少（和平均计算），则GPU可以是相当有效的。这意味着如果您有2000个阵列，我们将启动2000个块。

为了处理每个块具有固定线程数的任意大小的数组，我们将使用类似grid-striding loop的想法，但我们将使每个块使用块跨越循环加载与特定数组关联的所有数据。这意味着每个块的线程将会＃34; stride＆＃34;通过指定的数组，加载该数组的所有元素。

除此之外，主要还原操作类似于你所写的，并且平均值的计算是微不足道的 - 我们可以在将结果写入全局存储器之前计算平均值，一旦我们通过还原计算了总和

这是一个有效的例子。如果使用-DMEAN进行编译，代码将输出每个数组的平均值。如果省略该编译开关，代码将输出每个数组的总和。让N为数组的数量，让K为每个数组的大小。

$ cat t1285.cu
#include <stdio.h>

const size_t N = 1000; // number of arrays
const size_t K = 1000; // size of each array
const int nTPB = 256; // number of threads per block,  must be a power-of-2
typedef float mytype; // type of data to be summed

// produce the sum or mean of each array
template <typename T>
__global__ void breduce(const T * __restrict__ idata, T * __restrict__ odata, const int bsize){

  __shared__ T sdata[nTPB];

  T sum = 0;
  //block-striding loop
  size_t offset = blockIdx.x*bsize + threadIdx.x;
  while (offset < (blockIdx.x+1)*bsize){
    sum += idata[offset];
    offset += blockDim.x;}
  sdata[threadIdx.x] = sum;
  __syncthreads();
  //shared memory reduction sweep
  for (int i = nTPB>>1; i > 0; i>>=1){
    if (threadIdx.x < i) sdata[threadIdx.x] += sdata[threadIdx.x+i];
    __syncthreads();}
  // write output sum for this block/array
#ifndef MEAN
  if (!threadIdx.x) odata[blockIdx.x] = sdata[0];
#else
  if (!threadIdx.x) odata[blockIdx.x] = sdata[0]/bsize;
#endif
}

int main(){

  mytype *h_idata, *h_odata, *d_idata, *d_odata;
  h_idata=(mytype *)malloc(N*K*sizeof(mytype));
  h_odata=(mytype *)malloc(N*sizeof(mytype));
  cudaMalloc(&d_idata, N*K*sizeof(mytype));
  cudaMalloc(&d_odata, N*sizeof(mytype));
  for (size_t i = 0; i < N; i++)
    for (size_t j = 0; j < K; j++)
      h_idata[i*K+j] = 1 + (i&1); // fill alternating arrays with 1 and 2
  memset(h_odata, 0, N*sizeof(mytype));  // zero out
  cudaMemset(d_odata, 0, N*sizeof(mytype)); // zero out
  cudaMemcpy(d_idata, h_idata, N*K*sizeof(mytype), cudaMemcpyHostToDevice);
  breduce<<<N, nTPB>>>(d_idata, d_odata, K);
  cudaMemcpy(h_odata, d_odata, N*sizeof(mytype), cudaMemcpyDeviceToHost);
  // validate
  for (size_t i = 0; i < N; i++)
#ifndef MEAN
    if (h_odata[i] != (K*(1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)(K*(1 + (i&1)))); return 1;}
#else
    if (h_odata[i] != ((1 + (i&1)))) {printf("mismatch at %d, was: %f, should be: %f\n", i, (float)h_odata[i], (float)((1 + (i&1)))); return 1;}
#endif
  return 0;
}

$ nvcc -arch=sm_35 -o t1285 t1285.cu -DMEAN
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvcc -arch=sm_35 -o t1285 t1285.cu
$ cuda-memcheck ./t1285
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

使用CUDA C计算2000个2D阵列的平均值

1 个答案: