Suppose we have four float arrays to be used on the host side, plus four corresponding arrays to be used on the device side:
float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];
Now suppose we have a very simple kernel, taken from an example on NVIDIA's blog:
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
    }
}
This kernel is then called from the host side inside a for loop, like this:
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
I then compare the execution time of this loop against a pure-CPU version of it:
for (int r = 0; r < LOOP_N; r++)
{
    for (int i = 0; i < ARRAYS_SIZE; i++) {
        y[i] = 2.0f*x[i] + y[i];
        y2[i] = 2.0f*x2[i] + y2[i];
    }
}
Now, here is what I don't understand. Take, for example, ARRAYS_SIZE = 1000000 and LOOP_N = 1000. When I run both loops in the versions shown above, the ratio between the execution times of the CPU version and the CUDA version is around 6; that is, the CUDA version is roughly 6 times faster.
However, if I comment out one of the saxpy calls in the CUDA loop and one of the computations in the CPU loop, the CPU/CUDA ratio becomes around 210; that is, the CUDA version is roughly 210 times faster.
What is the technical reason for this performance hit when all I do is call the kernel repeatedly, with no memory transfers to or from the device? And is there any workaround for it?
A (hopefully) fully reproducible code example follows:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Typedef and constant variables
typedef std::chrono::high_resolution_clock::time_point timers;
const int LOOP_N = 1000;
const int ARRAYS_SIZE = 1000000;
//Pretty simple kernel, from the example in Nvidia's blog
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
    }
}
// Main loop
int main(void)
{
    timers t0, t1, t2;
    timers tfinal0, tfinal1, tfinal2;
    float *x, *x2, *y, *y2;
    float *d_x, *d_x2, *d_y, *d_y2;
    x = new float[ARRAYS_SIZE];
    x2 = new float[ARRAYS_SIZE];
    y = new float[ARRAYS_SIZE];
    y2 = new float[ARRAYS_SIZE];
    // Initializing arrays at the host side:
    for (int i = 0; i < ARRAYS_SIZE; i++) {
        x[i] = 1.0f;
        x2[i] = 1.0f;
        y[i] = 2.0f;
        y2[i] = 2.0f;
    }
    // GPU memory allocation:
    cudaMalloc(&d_x, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_x2, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_y, ARRAYS_SIZE * sizeof(float));
    cudaMalloc(&d_y2, ARRAYS_SIZE * sizeof(float));
    // Transferring arrays from host to device
    // (all four, so the second kernel also operates on initialized data):
    cudaMemcpy(d_x, x, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x2, x2, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y2, y2, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    //////////////////
    //   CPU run    //
    //////////////////
    t0 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < LOOP_N; r++)
    {
        for (int i = 0; i < ARRAYS_SIZE; i++) {
            // Comment one of the following out to see the point of my question:
            y[i] = 2.0f*x[i] + y[i];
            y2[i] = 2.0f*x2[i] + y2[i];
        }
    }
    tfinal0 = std::chrono::high_resolution_clock::now();
    auto time0 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal0 - t0).count();
    std::cout << "CPU: " << (float)time0 << " microseconds" << std::endl;
    //////////////////
    // GPU-CUDA run //
    //////////////////
    // Perform the SAXPY kernel on ARRAYS_SIZE elements, LOOP_N times
    t1 = std::chrono::high_resolution_clock::now();
    for (int r = 0; r < LOOP_N; r++)
    {
        // Comment one of the following out to see the point of my question:
        saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
        saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
    }
    tfinal1 = std::chrono::high_resolution_clock::now();
    auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal1 - t1).count();
    std::cout << "CUDA: " << (float)time1 << " microseconds" << std::endl;
    // Display the performance ratio CPU / GPU-CUDA:
    std::cout << "Ratio CPU/CUDA: " << (float)time0 / (float)time1 << std::endl;
    // Freeing memory used by the arrays:
    cudaFree(d_x);
    cudaFree(d_x2);
    cudaFree(d_y);
    cudaFree(d_y2);
    delete[] x;   // arrays allocated with new[] must be released with delete[], not free()
    delete[] x2;
    delete[] y;
    delete[] y2;
    return 0;
}
Answer 0 (score: 3)
You are not waiting for the kernels to finish. As all kernel launches are asynchronous, you need to explicitly call cudaDeviceSynchronize() before stopping your timer.
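For illustration, here is a minimal sketch of the corrected timing, reusing the names from the question's code; the only addition is the synchronization call before the timer is stopped:

t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
cudaDeviceSynchronize(); // block the host until all queued kernels have actually finished
tfinal1 = std::chrono::high_resolution_clock::now();

Without the synchronization, the timer measures only how long the host takes to enqueue the launches, not how long the GPU takes to execute them.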
The differences you observe between variants of your current code probably stem from the fact that the queue for launching kernels is finite, so at some point your code will start waiting for part of your kernels anyway. On Windows, kernel batching also comes into play: the driver will not even begin launching kernels until it has collected some number of them (or until a timeout expires).
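An alternative worth mentioning (not part of the original answer) is to time the GPU work with CUDA events, which record timestamps on the GPU's own timeline and are therefore not distorted by when the host happens to enqueue the launches. A minimal sketch, with error checking omitted:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start); // recorded on the default stream, before the kernels
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
cudaEventRecord(stop); // recorded after the last kernel in the stream
cudaEventSynchronize(stop); // wait until the stop event has actually been reached
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop); // elapsed GPU time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);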
Answer 1 (score: 0)
A simple change fixed the problem, but I would still very much like to understand the technical reason behind all of this.
The fix was simply to change the kernel in the toy example above to:
__global__
void saxpy(int n, float a, float *x, float *y, float *x2, float *y2)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
    {
        y[i] = a*x[i] + y[i];
        y2[i] = a*x2[i] + y2[i];
    }
}
and then to call it only once per iteration, like this:
for (int r = 0; r < LOOP_N; r++)
{
    saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y, d_x2, d_y2);
}
Now the performance gap relative to the CPU implementation stays the same, which is what one should expect.
If anyone can explain why this makes a difference, please post an answer and I will upvote it.