在 GPU 上让大量高斯-勒让德求积共享同一组根和权重

时间:2015-03-03 00:05:17

标签: c++ cuda gpu-programming

我打算以并行的方式计算大量的数值求积(quadrature),这些求积归根结底在所有计算中都使用同一组公共数据(一个相当大的根和权重数组,大约占 25 KB 内存)。高斯-勒让德求积方法本身很简单。我想通过声明 __device__ double *d_droot, *d_dweight 让设备上的所有线程都能访问根和权重。但我遗漏了一些东西,因为我必须显式地把指向数组的指针传给内核,内核才能正常运行。我怎样才能正确地做到这一点?更重要的是,为了在设备上提供更多可用内存,是否可以将根和权重刻录到设备内存的某个恒定部分?

附上代码

#include <math.h>
#include <stdlib.h>
#include <stdio.h>


/* Device-global pointers intended to let every thread reach the quadrature
   roots and weights without passing them as kernel arguments. */
__device__  double *d_droot, *d_dweight;


__device__ __host__
double f(double alpha,double x)
{
  /* Integrand handed to the Gauss-Legendre rule: exp(alpha * x). */
  const double exponent = alpha * x;
  return exp(exponent);
}

__global__
void lege_inte2(int n, double alpha, double a, double b, double *lroots, double *weight, double *result)
{
  /*
    Computes a 5-node Gauss-Legendre approximation of the integral of
    f(alpha, x) from a to b. Every thread whose global index i is below n
    evaluates the same quadrature and stores it in result[i].

    Parameters:
    n: Total number of quadratures (threads with index >= n do nothing)
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit
    b: Upper integration limit
    lroots[]: roots for the quadrature (entries 0..4 are read)
    weight[]: weights for the quadrature (entries 0..4 are read)
    result[]: receives one value per quadrature (entries 0..n-1 are written)
   */
  /* Affine map from the reference interval [-1, 1] onto [a, b]. */
  const double c1 = (b - a) / 2, c2 = (b + a) / 2;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      result[i] = 0.0;
      for (int dummy = 0; dummy < 5; dummy++)
        result[i] += weight[dummy] * f(alpha,c1 * lroots[dummy] + c2)*c1;
    }
}

__global__
void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
{
  /* NOTE(review): the same two names are declared __device__ at file scope;
     declaring them `extern __shared__` here conflicts with that declaration.
     Confirm which memory space is actually intended. */
  extern __shared__ double *d_droot;
  extern __shared__ double *d_dweight;
  /*
    Variant of lege_inte2 that reads the roots and weights through d_droot
    and d_dweight instead of receiving them as kernel arguments.

    Parameters:
    n: Total number of quadratures
    alpha: Parameter forwarded to the integrand f
    a: Lower integration limit (c1 and c2 below are built from b - a and b + a)
    b: Upper integration limit
    d_root[]: roots for the quadrature
    d_weight[]: weights for the quadrature
    result[]: receives one value per quadrature
   */
  /* NOTE(review): sum is declared but never used in this kernel. */
  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      result[i] = 0.0;
      for (dummy = 0; dummy < 5; dummy++)
    {
      result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
      /* Debug output: executed by every thread with i < n on each iteration. */
      printf(" Vale: %f \n", d_dweight[dummy]);
    }
    }
}


/* Host driver: fills the 5-node root/weight tables, launches N identical
   quadratures of f(1.0, x) from -3 to 3, and prints the largest deviation of
   the results from the reference value 20.03574985. */
int main(void)
{
  int N = 1<<23;
  int N_nodes = 5;


  double *droot, *dweight, *dresult, *d_dresult;


  /*host-side buffers*/
  droot =(double*)malloc(N_nodes*sizeof(double));
  dweight =(double*)malloc(N_nodes*sizeof(double));
  dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures*/


  /*device-side buffers*/
  /* NOTE(review): d_droot and d_dweight are the file-scope __device__
     variables, yet they are used here like ordinary host-side pointers
     (address taken, passed to cudaMalloc/cudaMemcpy) - verify. */
  cudaMalloc(&d_droot, N_nodes*sizeof(double));
  cudaMalloc(&d_dweight, N_nodes*sizeof(double));
  cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/


  /*roots and weights of the 5-node rule*/
  droot[0] = 0.90618;
  droot[1] = 0.538469;
  droot[2] = 0.0;
  droot[3] = -0.538469;
  droot[4] = -0.90618;


  dweight[0] = 0.236927;
  dweight[1] = 0.478629;
  dweight[2] = 0.568889;
  dweight[3] = 0.478629;
  dweight[4] = 0.236927;



  /*copy the tables host -> device*/
  cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);


  // Launch one thread per quadrature, 256 threads per block

  lege_inte2<<<(N+255)/256, 256>>>(N,1.0,  -3.0, 3.0, d_droot, d_dweight, d_dresult); /*This kernel works OK*/
  //lege_inte2_shared<<<(N+255)/256, 256>>>(N,  -3.0, 3.0,  d_dresult); /*why this one does not work? */





  /*copy the results device -> host*/
  cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost); 

  /*largest absolute deviation from the reference value over all N results*/
  double maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = max(maxError, abs(dresult[i]-20.03574985));
  printf("Max error: %f in %i quadratures \n", maxError, N);
  printf("integral: %f  \n" ,dresult[0]);



  /* NOTE(review): dresult was obtained from malloc, not cudaMalloc, and
     d_dresult is never released - verify the intended cleanup. */
  cudaFree(dresult);
  cudaFree(d_droot);
  cudaFree(d_dweight);

}

以及用于编译它的 Makefile:

# Builds the "gauss" executable from main.cpp, which is compiled as CUDA source.
objects = main.o

# Neither target produces a file with its own name.
.PHONY: all clean

all: gauss

# Link step. The object was built with -dc, so nvcc also performs the device
# link here. Recipe lines must start with a TAB character.
gauss: $(objects)
	nvcc -arch=sm_20 $(objects) -o gauss

# -x cu makes nvcc treat the .cpp file as CUDA code; -dc emits relocatable
# device code.
%.o: %.cpp
	nvcc -x cu -arch=sm_20  -I. -dc $< -o $@

clean:
	rm -f *.o gauss

提前感谢任何建议

1 个答案:

答案 0 :(得分:1)

您对 d_droot 和 d_dweight 的处理存在多处错误。我编译您的代码时,得到了如下多条警告:

t640.cu(86): warning: address of a __shared__ variable "d_droot" cannot be directly taken in a host function

t640.cu(87): warning: address of a __shared__ variable "d_dweight" cannot be directly taken in a host function

t640.cu(108): warning: a __shared__ variable "d_droot" cannot be directly read in a host function

t640.cu(109): warning: a __shared__ variable "d_dweight" cannot be directly read in a host function

这些警告不应被忽视。

  1. 这些声明:

    __device__  double *d_droot, *d_dweight;
    

    并没有定义 __shared__ 变量,所以下面这些行:

    extern __shared__ double *d_droot;
    extern __shared__ double *d_dweight;
    

    毫无意义。此外,如果您确实希望它们是动态分配的共享内存变量(这正是 extern __shared__ 的用途),则需要将分配大小作为第 3 个内核启动参数传入,而您并没有这样做。

  2. 这些语句不正确:

    cudaMalloc(&d_droot, N_nodes*sizeof(double));
    cudaMalloc(&d_dweight, N_nodes*sizeof(double));
    

    在主机代码中不能获取 __device__ 变量的地址,也不能使用 cudaMalloc 来分配 __device__ 变量;按照定义,它是静态分配的。

  3. 我建议进行规范的 CUDA 错误检查。作为快速测试,您也可以用 cuda-memcheck 运行代码。这两种方法都会表明代码中存在运行时错误(尽管它们并不能指出问题的症结所在)。

  4. 这些语句也不正确:

    cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
    

    cudaMemcpy 不是用于 __device__ 变量的正确 API。请改用 cudaMemcpyToSymbol。

  5. 以下代码修复了上述各种用法错误,可以无警告地编译,并且看起来能正确运行。它表明没有必要将 __device__ 变量作为内核参数传递:

    #include <math.h>
    #include <stdlib.h>
    #include <stdio.h>
    
    
    /* Device-global pointers read by the kernels. main() stores in them the
       addresses of the cudaMalloc'ed root and weight arrays, using
       cudaMemcpyToSymbol. */
    __device__  double *d_droot, *d_dweight;
    
    
    __device__ __host__
    double f(double alpha,double x)
    {
      /* Integrand handed to the Gauss-Legendre rule: exp(alpha * x). */
      const double exponent = alpha * x;
      return exp(exponent);
    }
    
    __global__
    void lege_inte2(int n, double alpha, double a, double b, double *result)
    {
      /*
        Computes a 5-node Gauss-Legendre approximation of the integral of
        f(alpha, x) from a to b. The roots and weights are read through the
        file-scope __device__ pointers d_droot and d_dweight, so they are not
        kernel arguments. Every thread whose global index i is below n
        evaluates the same quadrature and stores it in result[i].

        Parameters:
        n: Total number of quadratures (threads with index >= n do nothing)
        alpha: Parameter forwarded to the integrand f
        a: Lower integration limit
        b: Upper integration limit
        result[]: receives one value per quadrature (entries 0..n-1 are written)
       */
      /* Affine map from the reference interval [-1, 1] onto [a, b]. */
      const double c1 = (b - a) / 2, c2 = (b + a) / 2;

      int i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n)
        {
          result[i] = 0.0;
          for (int dummy = 0; dummy < 5; dummy++)
            result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
        }
    }
    
    __global__
    void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
    {
      /*
        Same quadrature as lege_inte2, with an added debug printf of each
        weight. Despite its name, this version declares no __shared__ memory;
        the roots and weights are read through the file-scope __device__
        pointers d_droot and d_dweight.

        Parameters:
        n: Total number of quadratures (threads with index >= n do nothing)
        alpha: Parameter forwarded to the integrand f
        a: Lower integration limit
        b: Upper integration limit
        result[]: receives one value per quadrature (entries 0..n-1 are written)
       */
      /* Affine map from the reference interval [-1, 1] onto [a, b]. */
      const double c1 = (b - a) / 2, c2 = (b + a) / 2;

      int i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n)
        {
          result[i] = 0.0;
          for (int dummy = 0; dummy < 5; dummy++)
            {
              result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
              /* Debug output: executed by every thread with i < n on each iteration. */
              printf(" Vale: %f \n", d_dweight[dummy]);
            }
        }
    }
    
    
    /* Prints a diagnostic and terminates when a CUDA runtime call fails. */
    static void checkCuda(cudaError_t status, const char *what)
    {
      if (status != cudaSuccess)
        {
          fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
        }
    }

    int main(void)
    {
      /*
        Runs N identical 5-node Gauss-Legendre quadratures of exp(x) from -3
        to 3 on the GPU and prints the largest deviation of the results from
        the reference value 20.03574985. The root and weight tables are
        allocated with cudaMalloc and their addresses are published to the
        kernels through the __device__ pointers d_droot and d_dweight.
       */
      const int N = 1<<23;
      const int N_nodes = 5;
      const int threads = 256;
      const int blocks = (N + threads - 1) / threads;

      double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;

      /* host-side buffers */
      droot =(double*)malloc(N_nodes*sizeof(double));
      dweight =(double*)malloc(N_nodes*sizeof(double));
      dresult =(double*)malloc(N*sizeof(double)); /* will receive the results of N quadratures */
      if (droot == NULL || dweight == NULL || dresult == NULL)
        {
          fprintf(stderr, "host memory allocation failed\n");
          return EXIT_FAILURE;
        }

      /* device-side buffers */
      checkCuda(cudaMalloc(&d_droot_temp, N_nodes*sizeof(double)), "cudaMalloc(roots)");
      checkCuda(cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double)), "cudaMalloc(weights)");
      checkCuda(cudaMalloc(&d_dresult, N*sizeof(double)), "cudaMalloc(results)");

      /* roots and weights of the 5-node rule */
      droot[0] = 0.90618;
      droot[1] = 0.538469;
      droot[2] = 0.0;
      droot[3] = -0.538469;
      droot[4] = -0.90618;

      dweight[0] = 0.236927;
      dweight[1] = 0.478629;
      dweight[2] = 0.568889;
      dweight[3] = 0.478629;
      dweight[4] = 0.236927;

      /* copy the tables host -> device ... */
      checkCuda(cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy(roots)");
      checkCuda(cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy(weights)");
      /* ... then store the device addresses (not the data) in the __device__ pointers */
      checkCuda(cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *)), "cudaMemcpyToSymbol(d_droot)");
      checkCuda(cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *)), "cudaMemcpyToSymbol(d_dweight)");

      /* one thread per quadrature */
      lege_inte2<<<blocks, threads>>>(N,1.0,  -3.0, 3.0, d_dresult);
      //lege_inte2_shared<<<blocks, threads>>>(N,1.0,  -3.0, 3.0,  d_dresult); /* variant with a per-thread debug printf */
      /* a launch returns no status; configuration errors are reported here */
      checkCuda(cudaGetLastError(), "kernel launch");

      /* blocking copy: it waits for the kernel, so execution errors surface here */
      checkCuda(cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy(results)");

      /* largest absolute deviation from the reference value over all N results */
      double maxError = 0.0;
      for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(dresult[i]-20.03574985));
      printf("Max error: %f in %i quadratures \n", maxError, N);
      printf("integral: %f  \n" ,dresult[0]);

      checkCuda(cudaFree(d_dresult), "cudaFree(results)");
      checkCuda(cudaFree(d_droot_temp), "cudaFree(roots)");
      checkCuda(cudaFree(d_dweight_temp), "cudaFree(weights)");

      free(dresult);
      free(dweight);
      free(droot);

      return 0;
    }
    

    (我无法保证结果。)

    现在,关于这个问题:

      

    更重要的是,为了在设备上提供更多可用内存,是否可以将根和权重刻录到设备内存的某个恒定部分?

    由于您对 d_dweight 和 d_droot 的访问看起来是统一(uniform)的:

    result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
    

    那么把它们定义为 __constant__ 内存空间中的变量可能会有好处。当 warp 中的每个线程都向常量内存请求同一个值(同一位置)时,常量内存的访问效率最佳。但是,__constant__ 内存不能动态分配,而仅仅把指针存放在常量内存中是没有意义的;那样得不到常量缓存机制的任何好处。

    因此,下面对代码的进一步修改演示了如何将这些值存储在 __constant__ 内存中,但这需要静态分配。此外,这并不会真正“节省”任何设备内存。无论您是用 cudaMalloc 动态分配、静态分配 __device__ 变量,还是通过 __constant__ 变量定义(同样是静态分配),所有这些方法都需要设备内存(板载 DRAM)中的全局内存作为后备存储。

    下面的代码演示了常量内存的一种可能用法:

    #include <math.h>
    #include <stdlib.h>
    #include <stdio.h>
    
    /* Number of quadrature nodes. It is a macro because it sizes the
       statically allocated __constant__ arrays below. */
    #define N_nodes 5
    
    /* Quadrature roots and weights, declared in the __constant__ memory space.
       main() fills them with cudaMemcpyToSymbol. */
    __constant__   double d_droot[N_nodes], d_dweight[N_nodes];
    
    
    __device__ __host__
    double f(double alpha,double x)
    {
      /* Integrand handed to the Gauss-Legendre rule: exp(alpha * x). */
      const double exponent = alpha * x;
      return exp(exponent);
    }
    
    __global__
    void lege_inte2(int n, double alpha, double a, double b, double *result)
    {
      /*
        Computes an N_nodes-point Gauss-Legendre approximation of the integral
        of f(alpha, x) from a to b. The roots and weights come from the
        __constant__ arrays d_droot and d_dweight, so they are not kernel
        arguments. Every thread whose global index i is below n evaluates the
        same quadrature and stores it in result[i].

        Parameters:
        n: Total number of quadratures (threads with index >= n do nothing)
        alpha: Parameter forwarded to the integrand f
        a: Lower integration limit
        b: Upper integration limit
        result[]: receives one value per quadrature (entries 0..n-1 are written)
       */
      /* Affine map from the reference interval [-1, 1] onto [a, b]. */
      const double c1 = (b - a) / 2, c2 = (b + a) / 2;

      int i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n)
        {
          result[i] = 0.0;
          /* Loop bound follows the size of the constant tables. */
          for (int dummy = 0; dummy < N_nodes; dummy++)
            result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
        }
    }
    
    __global__
    void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
    {
      /*
        Same quadrature as lege_inte2, with an added debug printf of each
        weight. Despite its name, this version declares no __shared__ memory;
        the roots and weights come from the __constant__ arrays d_droot and
        d_dweight.

        Parameters:
        n: Total number of quadratures (threads with index >= n do nothing)
        alpha: Parameter forwarded to the integrand f
        a: Lower integration limit
        b: Upper integration limit
        result[]: receives one value per quadrature (entries 0..n-1 are written)
       */
      /* Affine map from the reference interval [-1, 1] onto [a, b]. */
      const double c1 = (b - a) / 2, c2 = (b + a) / 2;

      int i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n)
        {
          result[i] = 0.0;
          /* Loop bound follows the size of the constant tables. */
          for (int dummy = 0; dummy < N_nodes; dummy++)
            {
              result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
              /* Debug output: executed by every thread with i < n on each iteration. */
              printf(" Vale: %f \n", d_dweight[dummy]);
            }
        }
    }
    
    
    /* Prints a diagnostic and terminates when a CUDA runtime call fails. */
    static void checkCuda(cudaError_t status, const char *what)
    {
      if (status != cudaSuccess)
        {
          fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
        }
    }

    int main(void)
    {
      /*
        Runs N identical N_nodes-point Gauss-Legendre quadratures of exp(x)
        from -3 to 3 on the GPU and prints the largest deviation of the
        results from the reference value 20.03574985. The roots and weights
        are copied into the __constant__ arrays d_droot and d_dweight.
       */
      const int N = 1<<23;
      /* N_nodes is a file-level macro here, not a local variable. */
      const int threads = 256;
      const int blocks = (N + threads - 1) / threads;

      double *droot, *dweight, *dresult, *d_dresult;

      /* host-side buffers */
      droot =(double*)malloc(N_nodes*sizeof(double));
      dweight =(double*)malloc(N_nodes*sizeof(double));
      dresult =(double*)malloc(N*sizeof(double)); /* will receive the results of N quadratures */
      if (droot == NULL || dweight == NULL || dresult == NULL)
        {
          fprintf(stderr, "host memory allocation failed\n");
          return EXIT_FAILURE;
        }

      /* device-side buffer for the results; the tables are statically allocated */
      checkCuda(cudaMalloc(&d_dresult, N*sizeof(double)), "cudaMalloc(results)");

      /* roots and weights of the 5-node rule */
      droot[0] = 0.90618;
      droot[1] = 0.538469;
      droot[2] = 0.0;
      droot[3] = -0.538469;
      droot[4] = -0.90618;

      dweight[0] = 0.236927;
      dweight[1] = 0.478629;
      dweight[2] = 0.568889;
      dweight[3] = 0.478629;
      dweight[4] = 0.236927;

      /* copy the table contents host -> __constant__ arrays */
      checkCuda(cudaMemcpyToSymbol(d_droot, droot, N_nodes*sizeof(double)), "cudaMemcpyToSymbol(d_droot)");
      checkCuda(cudaMemcpyToSymbol(d_dweight, dweight, N_nodes*sizeof(double)), "cudaMemcpyToSymbol(d_dweight)");

      /* one thread per quadrature */
      lege_inte2<<<blocks, threads>>>(N,1.0,  -3.0, 3.0, d_dresult);
      //lege_inte2_shared<<<blocks, threads>>>(N,1.0,  -3.0, 3.0,  d_dresult); /* variant with a per-thread debug printf */
      /* a launch returns no status; configuration errors are reported here */
      checkCuda(cudaGetLastError(), "kernel launch");

      /* blocking copy: it waits for the kernel, so execution errors surface here */
      checkCuda(cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost), "cudaMemcpy(results)");

      /* largest absolute deviation from the reference value over all N results */
      double maxError = 0.0;
      for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(dresult[i]-20.03574985));
      printf("Max error: %f in %i quadratures \n", maxError, N);
      printf("integral: %f  \n" ,dresult[0]);

      checkCuda(cudaFree(d_dresult), "cudaFree(results)");

      free(dresult);
      free(dweight);
      free(droot);

      return 0;
    }