Question

首先，我是CUDA编程的新手，所以先为这个简单的问题道歉。我研究过如何为GPU内核调用确定dimGrid和dimBlock的最佳取值，但出于某些原因，一直没能让它完全正常工作。

在我的家用电脑上，我有 GeForce GTX 580 （ Compute Capability 2.0 ）。每块1024个线程等。我可以让我的代码在这台PC上正常运行。我的gpu填充了988 * 988的距离数组。以下是代码的一部分：

#define SIZE 988

/*
 * Fills the SIZE x SIZE table d_distances with the pairwise distance between
 * points, computed as acos(sin*sin + cos*cos*cos(delta)) * 6371.
 *
 * d_distances   - output, SIZE * SIZE doubles, element [row * SIZE + col].
 * d_coordinates - input, two doubles per point; element 2*i feeds the sin/cos
 *                 terms and element 2*i+1 is the one whose difference is
 *                 taken (presumably latitude/longitude in radians -- confirm
 *                 against the host code that fills this buffer).
 *
 * Expects a 2D launch whose grid * block extent is at least SIZE in both x
 * and y; threads outside the table return without touching memory.
 *
 * NOTE(review): __sinf/__cosf are single-precision intrinsics, so the double
 * inputs are narrowed to float before the trigonometry; the double storage
 * does not add accuracy to the result.
 * NOTE(review): 6371 is presumably the mean Earth radius in km -- confirm.
 */
__global__ void createDistanceTable(double *d_distances, double *d_coordinates)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row >= SIZE || col >= SIZE)
        return;

    const double latRow = d_coordinates[row * 2 + 0];
    const double lonRow = d_coordinates[row * 2 + 1];
    const double latCol = d_coordinates[col * 2 + 0];
    const double lonCol = d_coordinates[col * 2 + 1];

    float cosAngle = __sinf((float)latRow) * __sinf((float)latCol) +
                     __cosf((float)latRow) * __cosf((float)latCol) *
                     __cosf((float)(lonCol - lonRow));

    // Rounding can push the value a hair outside [-1, 1] (typically when
    // row == col), which would make acos return NaN. Clamp with comparisons
    // rather than fminf/fmaxf so that a NaN coming from bad input still
    // propagates instead of being silently turned into a distance.
    if (cosAngle > 1.0f)
        cosAngle = 1.0f;
    else if (cosAngle < -1.0f)
        cosAngle = -1.0f;

    d_distances[row * SIZE + col] = acosf(cosAngle) * 6371;
}

主要调用内核：

// Launch configuration: 32 x 32 blocks of 32 x 32 threads each (the z
// dimension of a dim3 defaults to 1), i.e. 1024 threads per block and
// 1024 x 1024 threads in total.
const unsigned int threadsPerSide = 32;
const unsigned int blocksPerSide = 32;
dim3 dimBlock(threadsPerSide, threadsPerSide);
dim3 dimGrid(blocksPerSide, blocksPerSide);
createDistanceTable<<<dimGrid, dimBlock>>>(d_distances, d_coordinates);

我的问题是，我怎么也找不到办法让这段代码在我的笔记本电脑上正常运行。我的笔记本电脑的GPU是 GeForce 9600M GT（计算能力1.1），每块最多512个线程等。如果有人能指导我在笔记本电脑上应该如何为内核调用设置dimBlock和dimGrid，我将非常感激。谢谢你的建议！

Answer 1

您的代码中出现了一些问题。

在 CC < 1.3 的设备上使用了双精度（双精度需要 CC 1.3 及以上）。
您的线程块的大小（正如您所说，CC <= 1.3 意味着每个块最多512个线程，而您的代码每个块使用了1024个线程）。如果您确实需要编写多架构代码，我想您可以使用 __CUDA_ARCH__。
无错误检查或内存检查（cuda-memcheck）。您可以分配比您拥有的内存更多的内存，或者使用比GPU可以处理的更多的线程/块，并且您将无法检测到它。

根据您的代码考虑以下示例（我使用float代替double）：

#include <cuda.h>
#include <stdio.h>      // printf
#include <stdlib.h>     // exit

// The distance table is SIZE x SIZE.
#define SIZE 988
// Threads per block side. 16 x 16 = 256 threads fits the 512-thread limit of
// CC 1.x devices; 32 x 32 = 1024 threads requires CC 2.0 or later.
#define BLOCK_SIZE 16
// Blocks per grid side, rounded up so that GRID_SIZE * BLOCK_SIZE >= SIZE.
// A fixed value of 32 combined with BLOCK_SIZE 16 would launch only
// 512 x 512 threads and leave most of the table unwritten.
#define GRID_SIZE ((SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE)

// Both macros forward the call site's file name and line number to the
// helper functions below so that error messages point at the failing call.
#define CUDA_CHECK_ERROR() __cuda_check_errors(__FILE__, __LINE__)
#define CUDA_SAFE_CALL(err) __cuda_safe_call(err, __FILE__, __LINE__)

// See: http://codeyarns.com/2011/03/02/how-to-do-error-checking-in-cuda/
/*
 * Checks for errors from preceding kernel launches and terminates the
 * program with exit code -1 if one is found.
 *
 * filename, line_number - call-site location printed in the error message
 *                         (supplied by the CUDA_CHECK_ERROR macro).
 *
 * Two sources are checked, in order:
 *   1. cudaGetLastError() - reports launch failures such as an invalid
 *      grid/block configuration, which are recorded when the launch is
 *      issued and are not reliably returned by a later synchronization.
 *   2. cudaDeviceSynchronize() - blocks until the device is idle and reports
 *      errors raised while the kernel was executing.
 */
inline void
__cuda_check_errors (const char *filename, const int line_number)
{
  cudaError err = cudaGetLastError ();
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize ();

  if (err != cudaSuccess)
    {
      printf ("CUDA error %i at %s:%i: %s\n",
          err, filename, line_number, cudaGetErrorString (err));
      exit (-1);
    }
}

/*
 * Validates the status code returned by a CUDA runtime call.
 *
 * err                   - status returned by the wrapped call.
 * filename, line_number - call-site location printed in the error message
 *                         (supplied by the CUDA_SAFE_CALL macro).
 *
 * Returns normally when err is cudaSuccess; otherwise prints the numeric
 * code, the location and the runtime's description, then exits with -1.
 */
inline void
__cuda_safe_call (cudaError err, const char *filename, const int line_number)
{
  if (cudaSuccess == err)
    return;

  printf ("CUDA error %i at %s:%i: %s\n",
          err, filename, line_number, cudaGetErrorString (err));
  exit (-1);
}

/*
 * Fills the SIZE x SIZE table d_distances with the pairwise distance between
 * points, computed as acos(sin*sin + cos*cos*cos(delta)) * 6371.
 *
 * d_distances   - output, SIZE * SIZE floats, element [row * SIZE + col].
 * d_coordinates - input, two floats per point; element 2*i feeds the sin/cos
 *                 terms and element 2*i+1 is the one whose difference is
 *                 taken (presumably latitude/longitude in radians -- confirm
 *                 against the host code that fills this buffer).
 *
 * Expects a 2D launch whose grid * block extent is at least SIZE in both x
 * and y; threads outside the table return without touching memory.
 *
 * NOTE(review): 6371 is presumably the mean Earth radius in km -- confirm.
 */
__global__ void
createDistanceTable (float *d_distances, float *d_coordinates)
{
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;

  if (row >= SIZE || col >= SIZE)
    return;

  const float lat_row = d_coordinates[row * 2 + 0];
  const float lon_row = d_coordinates[row * 2 + 1];
  const float lat_col = d_coordinates[col * 2 + 0];
  const float lon_col = d_coordinates[col * 2 + 1];

  float cos_angle = __sinf (lat_row) * __sinf (lat_col) +
    __cosf (lat_row) * __cosf (lat_col) * __cosf (lon_col - lon_row);

  /* Rounding in the fast intrinsics can push the value a hair outside
     [-1, 1] (typically when row == col), which would make acosf return NaN.
     Clamp with comparisons rather than fminf/fmaxf so that a NaN coming from
     bad input still propagates instead of becoming a bogus distance.  */
  if (cos_angle > 1.0f)
    cos_angle = 1.0f;
  else if (cos_angle < -1.0f)
    cos_angle = -1.0f;

  d_distances[row * SIZE + col] = acosf (cos_angle) * 6371.0f;
}

/*
 * Example driver: allocates the device buffers, launches createDistanceTable
 * with a GRID_SIZE x GRID_SIZE grid of BLOCK_SIZE x BLOCK_SIZE blocks, checks
 * for launch and execution errors, and releases the buffers.
 *
 * No host data is uploaded and no result is copied back; the program exists
 * to show whether the chosen launch configuration is accepted by the device.
 */
int
main ()
{
  float *d_distances;
  float *d_coordinates;

  const size_t distances_bytes = (size_t) SIZE * SIZE * sizeof (float);
  /* The kernel reads two floats per point (indices 2*i and 2*i+1), so the
     coordinate buffer needs SIZE * 2 elements, not SIZE * SIZE.  */
  const size_t coordinates_bytes = (size_t) SIZE * 2 * sizeof (float);

  CUDA_SAFE_CALL (cudaMalloc (&d_distances, distances_bytes));
  CUDA_SAFE_CALL (cudaMalloc (&d_coordinates, coordinates_bytes));

  /* cudaMalloc does not initialize memory; zero the input so the kernel does
     not read indeterminate values in this upload-free example.  */
  CUDA_SAFE_CALL (cudaMemset (d_coordinates, 0, coordinates_bytes));

  dim3 dimGrid (GRID_SIZE, GRID_SIZE);
  dim3 dimBlock (BLOCK_SIZE, BLOCK_SIZE);
  createDistanceTable <<< dimGrid, dimBlock >>> (d_distances, d_coordinates);

  /* A rejected launch configuration is reported by cudaGetLastError();
     errors raised while the kernel runs surface at the synchronization
     performed by CUDA_CHECK_ERROR.  */
  CUDA_SAFE_CALL (cudaGetLastError ());
  CUDA_CHECK_ERROR ();

  CUDA_SAFE_CALL (cudaFree (d_distances));
  CUDA_SAFE_CALL (cudaFree (d_coordinates));

  return 0;
}

编译命令（相应地更改架构）：

nvcc prog.cu -g -G -lineinfo -gencode arch=compute_11,code=sm_11 -o prog

在CC 2.0上使用32x32块，在CC 1.1上使用16x16：

cuda-memcheck ./prog 
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors

在CC 2.0上使用33x33块或在CC 1.1上使用32x32块：

cuda-memcheck ./prog 
========= CUDA-MEMCHECK
========= Program hit error 9 on CUDA API call to cudaLaunch 
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/nvidia-current-updates/libcuda.so [0x26a230]
========= Host Frame:/opt/cuda/lib64/libcudart.so.5.0 (cudaLaunch + 0x242) [0x2f592]
========= Host Frame:./prog [0xc76]
========= Host Frame:./prog [0xa99]
========= Host Frame:./prog [0xac4]
========= Host Frame:./prog [0x9d1]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xed) [0x2176d]
========= Host Frame:./prog [0x859]
=========
========= ERROR SUMMARY: 1 error

错误9：

/**
* This indicates that a kernel launch is requesting resources that can
* never be satisfied by the current device. Requesting more shared memory
* per block than the device supports will trigger this error, as will
* requesting too many threads or blocks. See ::cudaDeviceProp for more
* device limitations.
*/ cudaErrorInvalidConfiguration         =      9,

在CUDA中确定dimGrid和dimBlock大小

1 个答案: