How do I benchmark a CUDA program?

Asked: 2019-11-03 13:30:48

Tags: c++ cuda nvidia

I'm trying to benchmark my first CUDA application, which adds two arrays, first on the CPU and then on the GPU.

Here is the program.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

// add two arrays
void add(int n, float *x, float *y) {
    for (int i = 0; i < n; i++) {
        y[i] += x[i];
    }
}

__global__ void addParallel(int n, float *x, float *y) {

    int i = threadIdx.x;

    if (i < n)
        y[i] += x[i];
}

void printElapseTime(std::chrono::microseconds elapsed_time) {
    cout << "completed in " << elapsed_time.count() << " microseconds" << endl;
}

int main() {

    // generate two arrays of million float values each
    cout << "Generating two lists of a million float values ... ";

    int n = 1 << 28;

    float *x, *y;

    cudaMallocManaged(&x, sizeof(float)*n);
    cudaMallocManaged(&y, sizeof(float)*n);

    // begin benchmark array generation
    auto begin = high_resolution_clock::now();

    for (int i = 0; i < n; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // end benchmark array generation
    auto end = high_resolution_clock::now();

    auto elapsed_time = duration_cast<microseconds>(end - begin);

    printElapseTime(elapsed_time);

    // begin benchmark addition cpu
    begin = high_resolution_clock::now();

    cout << "Adding both arrays using CPU ... ";
    add(n, x, y);

    // end benchmark addition cpu
    end = high_resolution_clock::now();

    elapsed_time = duration_cast<microseconds>(end - begin);

    printElapseTime(elapsed_time);

    // begin benchmark addition gpu
    begin = high_resolution_clock::now();

    cout << "Adding both arrays using GPU ... ";
    addParallel<<<1, 1024>>>(n, x, y);

    cudaDeviceSynchronize();

    // end benchmark addition gpu
    end = high_resolution_clock::now();

    elapsed_time = duration_cast<microseconds>(end - begin);

    printElapseTime(elapsed_time);

    cudaFree(x);
    cudaFree(y);

    return 0;
}

Surprisingly, the program produces the following output.

Generating two lists of a million float values ... completed in 13343211 microseconds
Adding both arrays using CPU ... completed in 543994 microseconds
Adding both arrays using GPU ... completed in 3030147 microseconds

I'd like to know where I'm going wrong. Why does the GPU computation take 6 times longer than the same addition on the CPU?

For reference, I'm running Windows 10 on an Intel i7 8750H with an Nvidia GTX 1060.

1 Answer:

Answer 0: (score: 0)

Note that your unified memory arrays hold 268 million floats each, which means roughly 1 GB of data per array has to be migrated to the device when you invoke the kernel. Run the program under a GPU profiler (nvprof, nvvp, or Nsight) and you should see that these host-to-device transfers account for the bulk of what you are measuring as compute time.
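To separate that page-migration cost from the kernel itself, one option is to prefetch the managed arrays to the GPU before starting the timer and to time the kernel with CUDA events instead of std::chrono. The sketch below is only an illustration of that idea, not part of the original answer; it assumes a device that supports cudaMemPrefetchAsync (such as the Pascal-based GTX 1060 mentioned in the question) and reuses the addParallel kernel and the x, y, n variables from the program above.

int device = 0;
cudaGetDevice(&device);

// migrate both managed arrays to the GPU up front, so the HtoD page
// migrations are not charged to the kernel timing below
cudaMemPrefetchAsync(x, sizeof(float) * n, device);
cudaMemPrefetchAsync(y, sizeof(float) * n, device);
cudaDeviceSynchronize();

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
// same launch as in the question; note it only touches the first 1024
// elements because the kernel indexes with threadIdx.x alone
addParallel<<<1, 1024>>>(n, x, y);
cudaEventRecord(stop);

cudaEventSynchronize(stop);

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
cout << "kernel completed in " << elapsed_ms * 1000.0f << " microseconds" << endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);

Running the original program under nvprof (for example nvprof ./app, where app is your executable) also prints a unified-memory summary showing how much data was migrated host-to-device, which makes it easy to confirm that the transfer, not the addition, dominates the 3-second figure.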
