CUDA Memory Transfer Overhead

Date: 2018-01-11 07:31:12

Tags: c++ optimization cuda gpu

It is common knowledge that copying data to the GPU is slow. I was wondering what, specifically, the "significant" cost of passing data to the GPU is.

__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

int main() {
    int size = 100000; // Or any arbitrarily large number
    int reps = 1000;   // Or any arbitrarily large number

    extern float* a; // float* of [size] allocated on the GPU
    extern float* b; // float* of [size] allocated on the GPU
    extern float* c; // float* of [size] allocated on the GPU

    for (int i = 0; i < reps; ++i)
        add_kernel<<<blocks, threads>>>(a, b, c, size);
}

Does passing size to the kernel incur (significant) overhead? Or does "data transfer" more specifically mean copying large arrays from the heap to the GPU?

I.e., would this variant be (noticeably) faster?

__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j) {
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
    }
}

int main() {
    int size = 100000; // Or any arbitrarily large number
    int reps = 1000;   // Or any arbitrarily large number

    extern float* a; // float* of [size] allocated on the GPU
    extern float* b; // float* of [size] allocated on the GPU
    extern float* c; // float* of [size] allocated on the GPU

    add_kernel<<<blocks, threads>>>(a, b, c, size, reps);
}

I.e. (again), in an "ideal" CUDA program, should the programmer try to express the bulk of the computation inside the CUDA kernels themselves, or is it fine to write kernels that are launched repeatedly from the CPU (assuming that passing values from the CPU stack does not incur much overhead)?

Edited for clarity.

3 Answers:

Answer 0 (score: 2)

Everything matters. To launch a kernel, the CPU has to somehow tell the GPU which kernel to invoke and with which arguments. At the "micro level", if your kernel performs only a few operations, these are appreciable costs. In real life, if your kernels do a lot of work, they are negligible.
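To get a feel for that launch cost yourself, here is a minimal sketch (mine, not part of the answer) that averages the launch overhead of an empty kernel using CUDA events; noop_kernel and the launch count are arbitrary choices:

#include <stdio.h>

__global__ void noop_kernel() {}

int main() {
    const int launches = 10000;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Time many back-to-back empty launches; the kernel does no work,
    // so the elapsed time is almost entirely launch overhead.
    cudaEventRecord(start);
    for (int i = 0; i < launches; ++i) {
        noop_kernel<<<1, 32>>>();
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("average launch overhead: %f microseconds\n",
           ms * 1000.0f / launches);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}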

If these micro-operations are not pipelined, the relative cost of servicing them is large. You can see this in NVIDIA's Visual Profiler. I don't know/remember the exact numbers, but the orders of magnitude are as follows. The bandwidth between CPU and GPU can be on the order of 1 GB/s, i.e. 1 byte per nanosecond. But in practice, sending a 4-byte packet and getting an acknowledgment takes about 1 microsecond. So sending 10000 bytes takes, say, 11 microseconds.
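Those latency-versus-bandwidth figures are easy to sanity-check with a sketch like the following (again mine, with arbitrary sizes), which times one tiny cudaMemcpy against one large one:

#include <stdio.h>
#include <stdlib.h>

int main() {
    const size_t big_n = 100 * 1000 * 1000;  // 100M floats, ~400 MB
    float small_h = 0.0f, *small_d, *big_d;
    float* big_h = (float*)malloc(big_n * sizeof(float));
    cudaMalloc((void**)&small_d, sizeof(float));
    cudaMalloc((void**)&big_d, big_n * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms = 0.0f;

    // Latency-dominated: a single 4-byte host-to-device copy
    cudaEventRecord(start);
    cudaMemcpy(small_d, &small_h, sizeof(float), cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("4-byte copy: %f microseconds\n", ms * 1000.0f);

    // Bandwidth-dominated: one ~400 MB host-to-device copy
    cudaEventRecord(start);
    cudaMemcpy(big_d, big_h, big_n * sizeof(float), cudaMemcpyHostToDevice);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("~400 MB copy: %f ms (%.2f GB/s)\n",
           ms, big_n * sizeof(float) / (ms * 1e6));

    cudaFree(small_d);
    cudaFree(big_d);
    free(big_h);
}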

Execution on the GPU is also optimized for massive parallelism, so executing 10 sequential operations with a single warp of 32 threads may take around 200 GPU clock cycles (roughly 0.2 microseconds). And say another 0.5 microseconds to send the kernel-launch command before it even starts.

In real life, the problem is usually that, due to the bandwidth limit, you spend 0.4 seconds just moving 100 million numbers (100M floats is about 400 MB, and 400 MB at 1 GB/s is about 0.4 s), while the computation itself takes around 0.1 milliseconds: a top-end GPU can execute roughly 1000 operations per cycle at close to 1 nanosecond per cycle, so 10^8 operations take about 10^8 / 10^12 = 10^-4 seconds.

Answer 1 (score: 0)

Hi, I have benchmarked the two versions. Simply launching a CUDA kernel does incur noticeable overhead.

Here is the output:

 Calculating... (BlackCat_Tensors) reps outside
It took me 27.359249 clicks (27.359249 seconds).

 Calculating... (BlackCat_Tensors) reps inside 
It took me 10.855168 clicks (10.855168 seconds).

Here is my benchmark:

/*
 * test_area.cu
 *
 *  Created on: Jan 11, 2018
 *      Author: joseph
 */

#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_

#include <omp.h>
#include <stdio.h>

int threads() {
    return 256;
}
int blocks(int size) {
    return (size + threads() - 1) / threads();
}

__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j) {
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
    }
}

int main() {
    int sz = 10000;    // Or any arbitrarily large number
    int reps = 10000;  // Or any arbitrarily large number

    float* a; // float* of [sz] allocated on the GPU
    float* b; // float* of [sz] allocated on the GPU
    float* c; // float* of [sz] allocated on the GPU

    cudaMalloc((void**)&a, sizeof(float) * sz);
    cudaMalloc((void**)&b, sizeof(float) * sz);
    cudaMalloc((void**)&c, sizeof(float) * sz);

    double t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps outside\n");

    for (int i = 0; i < reps; ++i) {
        add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
        cudaDeviceSynchronize(); // wait for each launch to finish before the next
    }
    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, t);

    t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps inside \n");

    // One launch; the reps loop runs on the device
    add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
    cudaDeviceSynchronize();

    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, t);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}

#endif /* TEST_AREA_CU_ */
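For what it's worth, since the benchmark uses omp_get_wtime it needs OpenMP enabled in the host compiler; with a standard CUDA toolchain, something like this should build it (file name per the header comment):

nvcc -Xcompiler -fopenmp test_area.cu -o test_area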

Answer 2 (score: 0)

Here is a secondary benchmark: I figured the thread count could be higher for the inner-loop version, since it computes more, and ergo it should show a bigger difference in performance.

/*
 * test_area.cu
 *
 *  Created on: Jan 11, 2018
 *      Author: joseph
 */

#ifndef TEST_AREA_CU_
#define TEST_AREA_CU_

#include <omp.h>
#include <stdio.h>

int threads() {
    return 256;
}
int blocks(int size) {
    return (size + threads() - 1) / threads();
}

__global__
void add_kernel(float* a, float* b, float* c, int size) {
    for (int i = 0; i < size; ++i) {
        a[i] = b[i] + c[i];
    }
}

__global__
void add_kernel(float* a, float* b, float* c, int size, int reps) {
    for (int j = 0; j < reps; ++j) {
        for (int i = 0; i < size; ++i) {
            a[i] = b[i] + c[i];
        }
    }
}

int main() {
    int sz = 10000;   // Or any arbitrarily large number
    int reps = 1000;  // Or any arbitrarily large number

    float* a; // float* of [sz] allocated on the GPU
    float* b; // float* of [sz] allocated on the GPU
    float* c; // float* of [sz] allocated on the GPU

    cudaMalloc((void**)&a, sizeof(float) * sz);
    cudaMalloc((void**)&b, sizeof(float) * sz);
    cudaMalloc((void**)&c, sizeof(float) * sz);

    double t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps outside\n");

    for (int i = 0; i < reps; ++i) {
        add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz);
        cudaDeviceSynchronize(); // wait for each launch to finish before the next
    }
    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, t);

    t = omp_get_wtime();
    printf("\n Calculating... (BlackCat_Tensors) reps inside \n");

    // One launch; the reps loop runs on the device
    add_kernel<<<blocks(sz), threads()>>>(a, b, c, sz, reps);
    cudaDeviceSynchronize();

    t = omp_get_wtime() - t;
    printf("It took me %f clicks (%f seconds).\n", t, t);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}

#endif /* TEST_AREA_CU_ */



 Calculating... (BlackCat_Tensors) reps outside
It took me 14.969501 clicks (14.969501 seconds).

 Calculating... (BlackCat_Tensors) reps inside 
It took me 13.060688 clicks (13.060688 seconds).
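
One caveat worth flagging (my observation, not part of either answer): in both benchmark kernels, every thread of every block runs the full size-element loop, so all threads redundantly compute the same sums. A conventional grid-stride formulation, sketched below, gives each thread its own slice of the array; the name add_kernel_strided is mine:

__global__
void add_kernel_strided(float* a, float* b, float* c, int size) {
    // Each thread starts at its global index and strides by the total
    // thread count, so elements are partitioned rather than duplicated.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < size;
         i += blockDim.x * gridDim.x) {
        a[i] = b[i] + c[i];
    }
}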