Question

我有以下两个基本相同的示例代码。 code1.cu 使用cudaMalloc和cudaMemcpy处理设备/主机变量值交换。

code2.cu 使用cudaMallocManaged，因此不需要cudaMemcpy。使用cudaMallocManaged时，我必须包括cudaDeviceSynchronize()才能获得正确的结果，而对于具有cudaMalloc的结果，则不需要这样做。我希望了解为什么会发生这种情况

code2.cu

#include <iostream>
#include <math.h>
#include <vector>
//

using namespace std;


// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j<max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}


int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMallocManaged((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    //
    float tot = 0;
    for(int i = 0; i<vx.size(); i++)
        for(int j = 0; j<vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;


    //
    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    tot =*d_tot;
    //

    //
    cout<<" GPU: tot: "<<tot<<endl;
    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}

code1.cu

#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;


// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if(i < max_x && j<max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}


int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    cudaMalloc((void **)&d_tot, sizeof(float));

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);

    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    //
    float tot = 0;
    for(int i = 0; i<vx.size(); i++)
        for(int j = 0; j<vy.size(); j++)
            tot += vx[i] + vy[j];

    cout<<"CPU: tot: "<<tot<<endl;


    //
    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);


    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
    //.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout<<"blockSize: "<<blockSize<<endl;
    cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    //
    cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

    //
    cout<<" GPU: tot: "<<tot<<endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}


//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0

删除cudaDeviceSynchronize()上的评论后，

GPU：总和：8.79609e + 12

Answer 1

CUDA内核启动是异步的。这意味着它们独立于启动它们的CPU线程执行。

由于这种异步启动，不能保证在您的CPU线程代码开始测试结果之前，CUDA内核就可以完成（甚至启动）。

因此，必须等到GPU内核完成后，cudaDeviceSynchronize()才能做到这一点。 cudaMemcpy也具有同步效果，因此，删除cudaMemcpy操作时，会丢失该同步，但是cudaDeviceSynchronize()会恢复它。

cudaMallocManaged和cudaDeviceSynchronize（）

1 个答案: