Question

我一直在尝试编写一个程序，使用GPU来计算使用高斯正交数值积分的积分。我一直试图弄清楚为什么这个程序不起作用。我认为我将其归结为函数调用d_one中传递的参数未正确复制到cuda c代码的事实。我不确定为什么会这样。我花了很多时间试图解决它，但我无法随心所欲。

以下是两个程序：

Fortran计划：

    implicit real*8(a-h,o-z)
    parameter ( nlinx = 22) ! Total number of mesh regions
    dimension sx(3*nlinx),swx(3*nlinx)

    xa = 0.d0
    xb = 5.d0
    ! In the following "nptx" is the total number of integration
    ! points. So, it is (nlinx * 3)
    call meshwt1(xa,xb,nlinx,ntan,sx,swx,nptx)

    ans0 = 0.d0

    CAll d_one(sx, swx, nptx, ans0)

    print *, ans0

    stop

    end

SUBROUTINE MESHWT1(A,B,N,NT,X,W,NTOT)
  implicit real*8(a-h,o-z)
  !3*N LINEAR POINTS FOR A TO B
  !NT=0 OR 1, 3*NT TAN PTS FOR B TO INFINITY
  !NTOT= 3*(N+NT)
  DIMENSION X(*),W(*),G(3),GW(3)
  G(1)  = -0.7745966
  G(2)  =  0.0000000
  G(3)  = -G(1)
  GW(2) = 0.8888888
  GW(1) = 0.5555555
  GW(3) = GW(1)
  Y = N
  DX = ( B - A ) / Y
  K = 0
  XA = A - DX
  XB = A
  DO 2 I = 1, N
  XA = XA + DX
  XB = XB + DX
  DO 2 J = 1, 3
  K = K + 1
  X(K) = 0.5 * ( XA + XB ) + 0.5 * ( XB - XA ) * G(J)
2 W(K) = 0.5 * ( XB - XA ) * GW(J)
  NTOT = K
  IF( NT .EQ. 1 )  GO TO 3
  GO TO 5
3 NTOT = K + 3
  DO 4 J = 1, 3
  K = K + 1
  Y = ( 1.0 + G(J) ) * 3.14159 * 0.25
  X(K) = XB + DTAN(Y)
4 W(K) = GW(J) * 3.14159 * 0.25 / ( DCOS(Y) ) ** 2
5 CONTINUE
  RETURN
  END

CUDA计划：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void loop_d(float *a, float *b, int N, float *ans)
{
    __shared__ float temp[66];
    int idx = threadIdx.x;
    if (idx < 66)
    {
              temp[idx] = a[idx] * b[idx];
    }
    __syncthreads();
    if (0 == idx)
    {
            float sum = 0.0;
            for (int i=0; i < 66; i++)
            {
                    sum += temp[i];
            }
            *ans = sum;
    }
}
// The following function is called from the Fortran program
extern "C" void d_one_(float *a, float *b, int *Np, float *ans)
{
    float *a_d, *b_d, *ans_d; // Declaring GPU Copies of the parameters passed
    int blocks = 1; // Number of blocks used
    int N = *Np;  // Number of threads is determined by the parameter nptx passed from the Fortran program

    // Allocating GPU memory
    cudaMalloc( (void **)&a_d, sizeof(float) * N);
    cudaMalloc( (void **)&b_d, sizeof(float) * N);
    cudaMalloc( (void **)&ans_d, sizeof(float) );
    // Copying information from CPU to GPU
    cudaMemcpy( a_d, a, sizeof(float) * N, cudaMemcpyHostToDevice );
    cudaMemcpy( b_d, b, sizeof(float) * N, cudaMemcpyHostToDevice );
    cudaMemcpy( ans_d, ans, sizeof(float), cudaMemcpyHostToDevice );
    // Calling the function on the GPU
    loop_d<<< blocks, N >>>(a_d, b_d, N, ans_d);
    cudaMemcpy( a, a_d, sizeof(float) * N, cudaMemcpyDeviceToHost );
    cudaMemcpy( b, b_d, sizeof(float) * N, cudaMemcpyDeviceToHost );
    cudaMemcpy( ans, ans_d, sizeof(float), cudaMemcpyDeviceToHost );

    // Freeing GPU memory
    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(ans_d);
    return;
}

程序的输出应为12.49999。我得到了-314的答案。感谢您提供的任何输入！

Answer 1

决定是否要使用单精度或双精度浮点变量。

目前，您在Fortran端使用双精度real*8，在C（++）端使用单精度float。

同时使用real*4和float，或real*8和double。

变量从Fortran错误地复制到cuda c程序

1 个答案: