Question

我正在尝试编写一个使用multi-gpu和推力库功能的CUDA内核。我使用了之前一些帖子中的一些提示。我试着编写一个简单的添加内核。我的明显意图是使用更复杂的内核。

我的代码如下：

#include "test.h"
int main(int argc, char *argv[])
{      
    int num_gpus = 0;   // number of CUDA GPUs
    // determine the number of CUDA capable GPUs
    cudaGetDeviceCount(&num_gpus);
    printf("number of CUDA devices:\t%d\n", num_gpus);

    typedef thrust::device_vector<int> dvec;
    typedef dvec *p_dvec;

    // Declaring Vectors
    std::vector<p_dvec> dvecs1;
    std::vector<p_dvec> dvecs2;
    std::vector<p_dvec> dvecs3;
    std::vector<double>p(num_gpus);
    dim3 DimGrid((DSIZE-1)/16.0 +1,1,1);
    dim3 DimBlock(16.0,1,1);

    // Initialize Vectors
    for(unsigned int i = 0; i < num_gpus; i++) {
        cudaSetDevice(i);
        p_dvec temp1 = new dvec(DSIZE);
        dvecs1.push_back(temp1);
        thrust::fill((*(dvecs1[i])).begin(),(*(dvecs1[i])).end(),1.0);
        p_dvec temp2 = new dvec(DSIZE);
        dvecs2.push_back(temp2);
        thrust::fill((*(dvecs2[i])).begin(),(*(dvecs2[i])).end(),2.0);
    }

  // Launching The Kernel
  for(unsigned int i = 0; i < num_gpus; i++) {
      cudaSetDevice(i);
      p_dvec temp = new dvec(DSIZE);
      dvecs3.push_back(temp);
      fooKernel<<<DimGrid,DimBlock>>>(convertToKernel(*dvecs1[i])),convertToKernel(*(dvecs2[i])),convertToKernel(*(dvecs3[i])));
      // Reduction Operation
      p[i]= thrust::reduce((*(dvecs3[i])).begin(),(*(dvecs3[i])).end(), (double) 0, thrust::plus<double>());
      std::cout<<*((*(dvecs3[i])).begin())<<std::endl;
      std::cout<<p[i]<<std::endl;
  }

  printf("Success\n");
  return 0;  
}

，头文件如下：

#include <stdio.h> 
#include <cstdio> 
#include <stdlib.h>
#include <cstdlib>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>   
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

#define DSIZE 1048560                                  

template < typename T >                                                        
struct    KernelArray                                                             
{                                                                              
  T*  _array;                                                                
  int _size;                                                                 
};                                                 

// Function to convert device_vector to structure                              
template < typename T >                                                        
KernelArray< T > convertToKernel( thrust::device_vector< T >& dVec )           
{                                                                              
    KernelArray< T > kArray;                                                   
    kArray._array = thrust::raw_pointer_cast( &dVec[0] );                      
    kArray._size  = ( int ) dVec.size();                                       

    return kArray;                                                             
}                                                                              

template< typename scalartype>                                                 
__global__ void fooKernel( KernelArray< scalartype > Array1, KernelArray<scalartype>Array2, KernelArray<scalartype> Array3)
{
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;                            
  if(i< DSIZE)
  Array3._array[i] = Array2._array[i] +Array1._array[i];                       

}

现在，如果DSIZE＆gt; 1048560，则结果为0; 我几乎没有问题：

1）如何确定向量的大小限制。我有8台设备。

2）有没有办法增加我可以使用的数据大小或改进代码？

3）我何时何地需要cudaDeviceSynchronize（）？

如果有人可以帮助我，我会很高兴。

Answer 1

如果您使用proper CUDA error checking来查明是否发生了CUDA错误，那么在使用fooKernel启动DSIZE > 1048560后，您将获得以下输出：

invalid argument

此错误的原因是您可以at most 65535 blocks in one dimension和

1048560/16 = 65535

因此，您没有遇到向量的大小限制，而是进入最大块限制。

使用Thrust库为多gpu编写CUDA内核时确定数组大小的限制

1 个答案: