I need to find the average of several thousand (20,000+) images, each represented as an array of unsigned short. Could you please check my code; it seems to me that it is not optimal.
My kernel:
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
    register float divider = div;
    register int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < Size) {
        B[ idx ] = (float) A[idx] / divider + B[idx];
    }
    //__syncthreads();
}
The kernel wrapper:
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
    unsigned short* deviceData1;
    float* deviceData2;
    cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
    cudaMalloc((void**)&deviceData2, length * sizeof(float));
    cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
    int threads = 1024; // my maximum
    int blocks = (length / threads); // length = 1280*960 -> blocks = 1200 (assumes length is a multiple of threads)
    VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
    cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}
Then I do:
float* avrg2f = (float*)malloc( width * height * sizeof(float));
memset( avrg2f, 0.0, sizeof(float) * width * height);
for (int k = 0; k < count; k++) {
    imageObjectList.at( curObj )->getImage( k );
    kernel_wrapper( avrg1, avrg2f, height * width, (float)count);
}
As a result, the average image should end up in avrg2f.
Thank you.
Answer 0 (score: 2)
If the images are all the same size, then your wrapper function does not need to perform the cudaMalloc and cudaFree operations on every call. Pre-allocate the needed storage once, and don't allocate and free it on every call to the wrapper.
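For example, here is a minimal sketch of the allocate-once pattern. The setup/teardown function names and the global buffer pointers are illustrative, not part of your code:

#include <cuda_runtime.h>

// Hypothetical helpers: device buffers are created once, reused for every image,
// and released only after all images have been processed.
unsigned short *d_pixels1 = NULL;   // device copy of one input image
float          *d_pixels2 = NULL;   // device accumulator for the running sum

void setup_buffers(unsigned int length)
{
    cudaMalloc((void**)&d_pixels1, length * sizeof(unsigned short));
    cudaMalloc((void**)&d_pixels2, length * sizeof(float));
    cudaMemset(d_pixels2, 0, length * sizeof(float));   // all-zero bit pattern == 0.0f
}

void teardown_buffers()
{
    cudaFree(d_pixels1);
    cudaFree(d_pixels2);
}

The per-image call then only has to copy the new image in and launch the kernel, which is what kernel_wrapper2 in the complete listing below does.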
Also, if you use a pinned allocation (cudaHostAlloc) on the host side for your image storage, you may see roughly a 2x speedup in the cudaMemcpy operations.
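A minimal sketch of what the pinned allocation looks like (replacing a plain malloc for the host-side image buffer; error checking omitted):

const int width  = 1280;   // image dimensions from your question
const int height = 960;
unsigned short *h_image = NULL;
// Page-locked (pinned) host memory transfers faster than pageable malloc'd memory.
cudaHostAlloc((void**)&h_image, width * height * sizeof(unsigned short), cudaHostAllocDefault);
// ... fill h_image and pass it to cudaMemcpy exactly as before ...
cudaFreeHost(h_image);     // pinned memory is released with cudaFreeHost, not free()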
Finally, for the duration of your loop there is no need to copy the results back to the host. Do that once, after you have finished computing the average. This saves 2 of the 3 cudaMemcpy operations you are doing in the wrapper.
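Putting the two previous points together, the per-image loop might look roughly like this. This is only a sketch: it assumes the pre-allocated d_pixels1/d_pixels2 buffers from above and reuses the names (h_image, blocks, threads, count, avrg2f) from your code:

for (int k = 0; k < count; k++) {
    // copy only the new input image each iteration; the accumulator stays on the device
    cudaMemcpy(d_pixels1, h_image, width * height * sizeof(unsigned short), cudaMemcpyHostToDevice);
    VecAdd<<< blocks, threads >>>(d_pixels1, d_pixels2, width * height, (float)count);
}
// a single device-to-host copy after the loop, instead of one per image
cudaMemcpy(avrg2f, d_pixels2, width * height * sizeof(float), cudaMemcpyDeviceToHost);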
While we're at it, in my opinion using memset to initialize a float array is questionable. It happens to work for a zero value, but essentially for no other value. Furthermore, I would expect passing 0.0 as the second parameter of memset (which takes an int) to at least throw a compiler warning.
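As a sketch, here are two equivalent and less error-prone ways to zero-initialize the float buffer (std::fill requires <algorithm>):

float* avrg2f = (float*)malloc(width * height * sizeof(float));
// explicit loop: works for any initial value, not just zero
for (int i = 0; i < width * height; i++)
    avrg2f[i] = 0.0f;
// or equivalently:
std::fill(avrg2f, avrg2f + width * height, 0.0f);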
The following code shows the above optimizations and demonstrates about an 8x speedup over your code in my test case:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>

__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
    register float divider = div;
    register int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < Size) {
        B[ idx ] = (float) A[idx] / divider + B[idx];
    }
    //__syncthreads();
}

__global__ void VecAdd2(unsigned short *A, float *B, unsigned int Size, float mult){
    register int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if ( idx < Size) {
        B[ idx ] = (float) A[idx] * mult + B[idx];
    }
}

// original wrapper: allocates, copies, launches, copies back, and frees on every call
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
    unsigned short* deviceData1;
    float* deviceData2;
    cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
    cudaMalloc((void**)&deviceData2, length * sizeof(float));
    cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
    int threads = 1024; // my maximum
    int blocks = (length / threads); // length = 1280*960 -> blocks = 1200
    VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
    cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
}

// optimized wrapper: device buffers are pre-allocated by the caller; only the
// input image is copied each call, and the result stays on the device
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2, unsigned int length, float my_mult)
{
    cudaMemcpy(d_pixels1, h_pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    int threads = 1024; // my maximum
    int blocks = (length / threads); // length = 1280*960 -> blocks = 1200
    VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2, length, my_mult );
}

int main(){
    const int count = 2000;
    const int width = 1280;
    const int height = 960;
    timeval t1, t2;
    unsigned long et;

    // case 1: pageable host memory, original wrapper
    unsigned short *h1_image;
    h1_image = (unsigned short *)malloc(height*width*sizeof(unsigned short));
    float* avrg2f = (float*)malloc( width * height * sizeof(float));
    for (int i = 0; i<height*width; i++){
        h1_image[i] = (i%256);
        avrg2f[i] = 0.0f;
    }
    gettimeofday(&t1,NULL);
    for (int k = 0; k < count; k++) {
        kernel_wrapper( h1_image, avrg2f, height * width, (float)count);
    }
    gettimeofday(&t2,NULL);
    et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
    printf("time 1 = %ld us\n", et);

    // case 2: pinned host memory, pre-allocated device buffers, single copy-back
    unsigned short *h2_image;
    float* avrg3f = (float*)malloc( width * height * sizeof(float));
    cudaHostAlloc((void **)&h2_image, height*width*sizeof(unsigned short), cudaHostAllocDefault);
    for (int i = 0; i<height*width; i++){
        h2_image[i] = (i%256);
        avrg3f[i] = 0.0f;
    }
    gettimeofday(&t1,NULL);
    unsigned short *d_image;
    float *d_result;
    cudaMalloc((void **)&d_image, height*width*sizeof(unsigned short));
    cudaMalloc((void **)&d_result, height*width*sizeof(float));
    cudaMemcpy(d_result, avrg3f, height*width*sizeof(float), cudaMemcpyHostToDevice);
    for (int k = 0; k < count; k++) {
        kernel_wrapper2( h2_image, d_image, d_result, height * width, (float)(1/(float)count));
    }
    cudaMemcpy(avrg3f, d_result, height*width*sizeof(float), cudaMemcpyDeviceToHost);
    gettimeofday(&t2,NULL);
    et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
    printf("time 2 = %ld us\n", et);

    // verify that both approaches produce the same average image
    for (int i = 0; i < (height*width); i++)
        if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]); return 1;}
    return 0;
}
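This should build as a single .cu file with something like nvcc -O2 -o avg avg.cu (the file name here is just an example). The program prints the two timings and exits non-zero if the two results disagree.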