It's common knowledge that copying data to the GPU is slow. What I'd like to know is how much it specifically "matters" to pass data to the GPU.
__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

int main() {
    int size = 100000; // Or any arbitrarily large number
    int reps = 1000;   // Or any arbitrarily large number

    extern float* a; // float* of [size] allocated on the GPU
    extern float* b; // float* of [size] allocated on the GPU
    extern float* c; // float* of [size] allocated on the GPU

    for (int i = 0; i < reps; ++i)
        add_kernel<<<blocks, threads>>>(a, b, c, size);
}
Does passing size to the kernel incur (significant) overhead? Or does "data transfer" more specifically mean copying large arrays from the heap to the GPU?
I.e., would this variant be (faster):
__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
}

int main() {
    int size = 100000; // Or any arbitrarily large number
    int reps = 1000;   // Or any arbitrarily large number

    extern float* a; // float* of [size] allocated on the GPU
    extern float* b; // float* of [size] allocated on the GPU
    extern float* c; // float* of [size] allocated on the GPU

    add_kernel<<<blocks, threads>>>(a, b, c, size, reps);
}
I.e. (again), in an "ideal" CUDA program, should the programmer try to write the bulk of the computation inside pure CUDA kernels, or is it fine to write CUDA kernels that are then called repeatedly from the CPU (in the case where passing values from the CPU stack does not incur much overhead)?
Edited for clarity.
Answer 0 (score: 2)
Everything matters. To run a kernel, the CPU has to communicate, one way or another, which kernel to invoke and with which parameters. At the "micro level", if your kernel performs only a few operations, these are appreciable costs. In real life, if your kernels do a lot of work, they are negligible.
If these small operations are not pipelined, the relatively large per-call overhead dominates. You can see this in NVidia's Visual Profiler. I don't know/remember the exact numbers, but the orders of magnitude are as follows. The bandwidth between CPU and GPU can be about 1 GB/s, i.e. 1 byte per nanosecond. But in practice, sending a 4-byte packet and getting an acknowledgement takes about 1 microsecond. So sending 10000 bytes takes, say, 11 microseconds.
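As a rough way to observe this per-launch cost yourself, here is a minimal sketch that times repeated launches of an empty kernel with CUDA events (the kernel name and rep count here are made up for illustration):

#include <stdio.h>

// An empty kernel: timing it isolates the launch overhead itself.
__global__ void empty_kernel() {}

int main() {
    const int reps = 10000; // arbitrary; more reps averages out noise
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < reps; ++i)
        empty_kernel<<<1, 1>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
    printf("average launch overhead: %f microseconds\n", 1000.0f * ms / reps);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}

Since the kernel body does nothing, essentially all of the measured time is the cost of getting each launch to the GPU.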
Massed execution on the GPU also optimizes how operations are carried out, so executing 10 consecutive operations with one warp of 32 threads may take around 200 GPU clock cycles (roughly 0.2 microseconds). And figure, say, another 0.5 microseconds for sending the kernel-execution command before the launch.
In real life, the problem is usually that, due to bandwidth limits, you spend 0.4 seconds just transferring 100 million numbers, while the computation itself takes about 0.1 milliseconds, because a top GPU can execute on the order of 1000 operations per cycle, with each cycle close to 1 nanosecond.
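To make that arithmetic explicit (assuming the round numbers above: ~1 GB/s of transfer bandwidth and ~1000 operations per ~1 ns cycle):

transfer: 1e8 floats * 4 bytes = 400 MB;  400 MB / (1 GB/s) ≈ 0.4 s
compute:  1e8 additions / (1000 ops/cycle * 1e9 cycles/s) = 1e-4 s = 0.1 ms

So the transfer is several thousand times more expensive than the computation it feeds.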
Answer 1 (score: 0)
Hi, I have benchmarked both versions. Simply invoking a CUDA kernel does incur noticeable overhead.
Here is the output:
Calculating... (BlackCat_Tensors) reps outside
It took me 27.359249 clicks (27.359249 seconds).
Calculating... (BlackCat_Tensors) reps inside
It took me 10.855168 clicks (10.855168 seconds).
Here is my benchmark:
/*
 * test_area.cu
 *
 *  Created on: Jan 11, 2018
 *      Author: joseph
 */
#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_

#include <omp.h>
#include <stdio.h>

int threads() {
    return 256;
}
int blocks(int size) {
    return (size + threads() - 1) / threads();
}

__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
}

int main() {
    int sz   = 10000; // Or any arbitrarily large number
    int reps = 10000; // Or any arbitrarily large number

    float* a; // float* of [sz] allocated on the GPU
    float* b; // float* of [sz] allocated on the GPU
    float* c; // float* of [sz] allocated on the GPU

    cudaMalloc((void**)&a, sizeof(float) * sz);
    cudaMalloc((void**)&b, sizeof(float) * sz);
    cudaMalloc((void**)&c, sizeof(float) * sz);

    double t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps outside\n");
    for (int i = 0; i < reps; ++i) {
        add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
        cudaDeviceSynchronize();
    }
    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, ((float) t));

    t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps inside \n");
    add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
    cudaDeviceSynchronize();
    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, ((float) t));

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
#endif /* TEST_AREA_CU_ */
Answer 2 (score: 0)
Here is a secondary benchmark: I thought the threads in the reps-inside variant might be utilized more, since that kernel computes more per launch, and ergo the performance difference should be even larger. (A grid-stride variant of these kernels is sketched after the output below.)
/*
 * test_area.cu
 *
 *  Created on: Jan 11, 2018
 *      Author: joseph
 */
#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_

#include <omp.h>
#include <stdio.h>

int threads() {
    return 256;
}
int blocks(int size) {
    return (size + threads() - 1) / threads();
}

__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j)
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
}

int main() {
    int sz   = 10000; // Or any arbitrarily large number
    int reps = 1000;  // Or any arbitrarily large number

    float* a; // float* of [sz] allocated on the GPU
    float* b; // float* of [sz] allocated on the GPU
    float* c; // float* of [sz] allocated on the GPU

    cudaMalloc((void**)&a, sizeof(float) * sz);
    cudaMalloc((void**)&b, sizeof(float) * sz);
    cudaMalloc((void**)&c, sizeof(float) * sz);

    double t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps outside\n");
    for (int i = 0; i < reps; ++i) {
        add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
        cudaDeviceSynchronize();
    }
    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, ((float) t));

    t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps inside \n");
    add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
    cudaDeviceSynchronize();
    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, ((float) t));

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
#endif /* TEST_AREA_CU_ */
Calculating... (BlackCat_Tensors) reps outside
It took me 14.969501 clicks (14.969501 seconds).
Calculating... (BlackCat_Tensors) reps inside
It took me 13.060688 clicks (13.060688 seconds).
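An aside on both benchmarks above: as written, every thread of add_kernel runs the full serial loop, so all blocks(sz) * threads() threads redundantly write the same size elements. A minimal sketch of the idiomatic grid-stride form, which partitions the iterations across threads instead, might look like this (add_kernel_strided is a made-up name, not part of the benchmarks above):

__global__
void add_kernel_strided(float* a, float* b, float* c, int size) {
    // Each thread starts at its own global index and advances by the
    // total number of threads in the grid, so every element is written
    // exactly once across the whole grid.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < size;
         i += blockDim.x * gridDim.x) {
        a[i] = b[i] + c[i];
    }
}

Launched as add_kernel_strided<<<blocks(sz), threads()>>>(a, b, c, sz), it performs each addition once in total rather than once per thread, which would make the compute-versus-launch-overhead comparison fairer.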