我编写了两个程序来对长度为 1024 的向量做加法,并各运行 1024 次,以查看哪个运行得更快。其中一个是基于 cuda 的,另一个不是。我原以为基于 cuda 的会表现得更好,但事实并非如此。下面是两个程序。请看看我在这里做错了什么?我需要更长的数组吗?在 cuda 中使用任何更长的数组都会产生 cudaError。
非 cuda 代码:
#include <iostream>
#include <cstdlib>
#include <chrono>
#include <fstream>
using namespace std;
// CPU vector-addition benchmark: fills two addCount-element arrays with
// random values in [0, max), sums them element-wise, repeats this addLen
// times, and writes each iteration's duration (raw clock-tick count from
// high_resolution_clock) to "outdata.dat", one value per line.
int main()
{
    const int addLen = 1024;             // number of timed iterations
    const unsigned long addCount = 1024; // elements per vector
    double *timeStops = new double[addLen];
    int *arr1 = new int[addCount];
    int *arr2 = new int[addCount];
    int *arr3 = new int[addCount];
    int max = 100; // random values fall in [0, max)
    for (int j = 0; j < addLen; j++) {
        auto tstart = chrono::high_resolution_clock::now();
        // NOTE: the timed region includes the random fill, not just the
        // addition, mirroring what the CUDA version times.
        for (unsigned long i = 0; i < addCount; i++) {
            arr1[i] = rand() % max;
            arr2[i] = rand() % max;
        }
        for (unsigned long i = 0; i < addCount; i++) {
            arr3[i] = arr1[i] + arr2[i];
        }
        auto tend = chrono::high_resolution_clock::now();
        timeStops[j] = (tend - tstart).count();
    }
    // BUG FIX: "delete[] arr1, arr2, arr3;" uses the comma operator and
    // only frees arr1, leaking arr2 and arr3. Each array needs its own
    // delete[].
    delete[] arr1;
    delete[] arr2;
    delete[] arr3;
    ofstream outdata;
    outdata.open("outdata.dat");
    for (int j = 0; j < addLen; j++) {
        outdata << timeStops[j] << endl;
    }
    outdata.close();
    delete[] timeStops;
}
Cuda 代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <chrono>
#include <fstream>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
// Element-wise vector add: c[i] = a[i] + b[i], one element per thread.
// Grid/block layout: the caller must launch exactly one thread per element
// (gridDim.x * blockDim.x == element count); there is no length parameter,
// so out-of-range threads are not guarded against.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Generalized from plain "threadIdx.x" so that launches using more
    // than one block index correctly -- with threadIdx.x alone the kernel
    // was hard-capped at the 1024 threads-per-block limit, which is why
    // longer arrays failed. For the existing <<<1, size>>> launch
    // (blockIdx.x == 0) this computes exactly the same index.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
using namespace std;
// GPU vector-addition benchmark: repeats addLen timed calls to addWithCuda
// on arraySize-element vectors and writes each duration (raw clock-tick
// count) to "outcudadata.dat". Note the timed region includes device
// allocation and host<->device copies, so tiny arrays will not beat the
// CPU version.
int main()
{
    const int addLen = 1024;            // number of timed iterations
    const unsigned long arraySize = 10; // elements per vector
    // Value-initialize so entries are 0.0 even if we bail out early.
    double* timeStops = new double[addLen]();
    int* a = new int[arraySize];
    int* b = new int[arraySize];
    int* c = new int[arraySize];
    for (int j = 0; j < addLen; j++) {
        for (unsigned long i = 0; i < arraySize; i++) {
            a[i] = rand() % 100;
            b[i] = rand() % 100;
        }
        auto tstart = chrono::high_resolution_clock::now();
        // BUG FIX: the cudaError_t return was silently discarded, so a
        // failed run (e.g. too many threads per block) still recorded
        // meaningless timings.
        cudaError_t status = addWithCuda(c, a, b, arraySize);
        auto tend = chrono::high_resolution_clock::now();
        if (status != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed: %s\n",
                    cudaGetErrorString(status));
            break;
        }
        timeStops[j] = (tend - tstart).count();
    }
    cudaDeviceReset();
    // BUG FIX: "delete[] a, b, c;" uses the comma operator and only frees
    // a, leaking b and c.
    delete[] a;
    delete[] b;
    delete[] c;
    ofstream outdata;
    outdata.open("outcudadata.dat");
    for (int j = 0; j < addLen; j++) {
        outdata << timeStops[j] << endl;
    }
    outdata.close();
    delete[] timeStops;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaSetDevice(0);
cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaMalloc((void**)&dev_a, size * sizeof(int));
cudaMalloc((void**)&dev_b, size * sizeof(int));
cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
cudaDeviceSynchronize();
cudaError_t cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
答案 0(得分:4)
我原以为基于 cuda 的性能会更好,但事实并非如此。
这在意料之中。与内存带宽要求相比,算术强度如此低的运算在 GPU 上运行时可能无法胜过 CPU,尤其是当您在计时中包含进出 GPU 的传输时。
> 在 cuda 中使用任何更长的数组都会产生 cudaError。
有 1024 threads per block 的限制。为了解决更大的问题,您需要阅读有关块和网格尺寸标注的一些资料。