我编写了两个程序来对长度为 1024 的向量做加法,并各运行 1024 次,以查看哪个运行得更快。其中一个是基于 cuda 的,另一个不是。我原以为基于 cuda 的会表现得更好,但事实并非如此。下面是两个程序。请看看我在这里做错了什么?我需要更长的数组吗?在 cuda 中使用任何更长的数组都会产生 cudaError。
非 cuda 代码:
#include <iostream>
#include <cstdlib>
#include <chrono>
#include <fstream>
using namespace std;
// CPU vector-addition benchmark: fills two addCount-element arrays with
// random values in [0, max), sums them element-wise, repeats this addLen
// times, and writes each iteration's duration (raw clock-tick count from
// high_resolution_clock) to "outdata.dat", one value per line.
int main()
{
    const int addLen = 1024;             // number of timed iterations
    const unsigned long addCount = 1024; // elements per vector
    double *timeStops = new double[addLen];
    int *arr1 = new int[addCount];
    int *arr2 = new int[addCount];
    int *arr3 = new int[addCount];
    int max = 100; // random values fall in [0, max)
    for (int j = 0; j < addLen; j++) {
        auto tstart = chrono::high_resolution_clock::now();
        // NOTE: the timed region includes the random fill, not just the
        // addition, mirroring what the CUDA version times.
        for (unsigned long i = 0; i < addCount; i++) {
            arr1[i] = rand() % max;
            arr2[i] = rand() % max;
        }
        for (unsigned long i = 0; i < addCount; i++) {
            arr3[i] = arr1[i] + arr2[i];
        }
        auto tend = chrono::high_resolution_clock::now();
        timeStops[j] = (tend - tstart).count();
    }
    // BUG FIX: "delete[] arr1, arr2, arr3;" uses the comma operator and
    // only frees arr1, leaking arr2 and arr3. Each array needs its own
    // delete[].
    delete[] arr1;
    delete[] arr2;
    delete[] arr3;
    ofstream outdata;
    outdata.open("outdata.dat");
    for (int j = 0; j < addLen; j++) {
        outdata << timeStops[j] << endl;
    }
    outdata.close();
    delete[] timeStops;
}
Cuda 代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <chrono>
#include <fstream>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector add: c[i] = a[i] + b[i], one element per thread.
// Grid/block layout: the caller must launch exactly one thread per element
// (gridDim.x * blockDim.x == element count); there is no length parameter,
// so out-of-range threads are not guarded against.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Generalized from plain "threadIdx.x" so that launches using more
    // than one block index correctly -- with threadIdx.x alone the kernel
    // was hard-capped at the 1024 threads-per-block limit, which is why
    // longer arrays failed. For the existing <<<1, size>>> launch
    // (blockIdx.x == 0) this computes exactly the same index.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
using namespace std;
// GPU vector-addition benchmark: repeats addLen timed calls to addWithCuda
// on arraySize-element vectors and writes each duration (raw clock-tick
// count) to "outcudadata.dat". Note the timed region includes device
// allocation and host<->device copies, so tiny arrays will not beat the
// CPU version.
int main()
{
    const int addLen = 1024;            // number of timed iterations
    const unsigned long arraySize = 10; // elements per vector
    // Value-initialize so entries are 0.0 even if we bail out early.
    double* timeStops = new double[addLen]();
    int* a = new int[arraySize];
    int* b = new int[arraySize];
    int* c = new int[arraySize];
    for (int j = 0; j < addLen; j++) {
        for (unsigned long i = 0; i < arraySize; i++) {
            a[i] = rand() % 100;
            b[i] = rand() % 100;
        }
        auto tstart = chrono::high_resolution_clock::now();
        // BUG FIX: the cudaError_t return was silently discarded, so a
        // failed run (e.g. too many threads per block) still recorded
        // meaningless timings.
        cudaError_t status = addWithCuda(c, a, b, arraySize);
        auto tend = chrono::high_resolution_clock::now();
        if (status != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed: %s\n",
                    cudaGetErrorString(status));
            break;
        }
        timeStops[j] = (tend - tstart).count();
    }
    cudaDeviceReset();
    // BUG FIX: "delete[] a, b, c;" uses the comma operator and only frees
    // a, leaking b and c.
    delete[] a;
    delete[] b;
    delete[] c;
    ofstream outdata;
    outdata.open("outcudadata.dat");
    for (int j = 0; j < addLen; j++) {
        outdata << timeStops[j] << endl;
    }
    outdata.close();
    delete[] timeStops;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaSetDevice(0);
cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaMalloc((void**)&dev_a, size * sizeof(int));
cudaMalloc((void**)&dev_b, size * sizeof(int));
cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
cudaDeviceSynchronize();
cudaError_t cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
答案 0(得分:4)
我原以为基于 cuda 的性能会更好,但事实并非如此。
这在意料之中。与内存带宽要求相比,算术强度如此低的运算在 GPU 上运行时可能无法胜过 CPU,尤其是当您在计时中包含进出 GPU 的传输时。
> 在 cuda 中使用任何更长的数组都会产生 cudaError。
有 1024 threads per block 的限制。为了解决更大的问题,您需要阅读有关块和网格尺寸标注的一些资料。