我正在使用CUDA在GPU(Fermi)上进行有限差分计算(模板计算)。当我使用CUDA探查器测试我的代码时,我发现占用率是0.333。在我调整了计算的组织方式并将占用率提高到0.677之后,内核的执行时间没有减少,反而增加了。换句话说,当占用率提高约1/3时,性能反而下降了。
我的问题是:
无论占用率如何,内核的性能是否依赖于计算?
答案 0 :(得分:4)
答案是“视情况而定”:既取决于工作负载的特性,也取决于你如何定义性能。一般来说,如果你的瓶颈是数学运算吞吐量,通常较低的占用率(12.5%-33%)就足够了;但如果你的瓶颈是内存,那么通常需要更高的占用率(66%或更高)。这只是一个经验法则,而不是绝对的规则。大多数内核都位于两者之间,但两种极端情况都有例外。
占用率是一次可以激活的内核的最大线程数(受每个线程或其他资源的寄存器数限制)除以GPU在不受其他资源限制时可以激活的最大线程数。活动意味着线程已分配硬件资源并可用于调度,而不是它在给定时钟周期内执行任何指令。
在为一个线程发出指令 i 之后,如果该线程的指令 i + 1 依赖于指令 i 的结果,那么它可能无法立即执行。如果指令 i 是数学指令,则结果将在几个时钟周期内可用;如果是内存加载指令,则可能是100个周期。GPU不会空等,而是转而发出其他依赖已经满足的线程的指令。
因此,如果你主要是做数学运算,那么只需要少量线程(按GPU的标准算少;在CPU上则会被认为很多)就能隐藏数学指令的几个周期的延迟,所以较低的占用率也没有问题。但是如果你有大量的内存流量,就需要更多的线程来确保每个周期都有一些线程准备好执行,因为每个线程都要花费大量时间“休眠”,等待内存操作完成。
如果您为增加占用率所做的算法更改也增加了每个线程的工作量,如果您已经有足够的线程来保持GPU忙,那么更改只会减慢您的速度。增加占用率只会提高性能,直到你有足够的线程来保持GPU忙碌。
答案 1 :(得分:1)
为了最大化算法性能(通常以执行时间来衡量),占用率并不是唯一值得关注的指标。我建议看一下 Vasily Volkov 在 GTC 2010 上的演讲:
Better Performance at Lower Occupancy
下面,我提供了一个简单的例子,受到上述演示文稿第二部分的启发。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define BLOCKSIZE 512
//#define DEBUG
/*******************/
/* iDivUp FUNCTION */
/*******************/
/* Integer division of a by b, bumped by one whenever the remainder is
 * non-zero. Used to size a grid so that it covers all a elements with
 * blocks of b threads. b must be non-zero. */
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
/* Wraps a CUDA runtime call and reports the call site on failure. */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

/* Prints a diagnostic to stderr when code is not cudaSuccess and, if abort
 * is true (the default), terminates the process with code as exit status.
 *
 * code  - status returned by a CUDA runtime call
 * file  - source file of the call site (a string literal such as __FILE__,
 *         hence the const qualifier)
 * line  - source line of the call site
 * abort - when false, the error is only reported and execution continues
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/***********************************************/
/* MEMCPY1 - EACH THREAD COPIES ONE FLOAT ONLY */
/***********************************************/
/* Copies src[0..N) to dst[0..N), one element per thread.
 * Expects a 1D launch; threads whose global index is >= N do nothing. */
__global__ void memcpy1(float *src, float *dst, unsigned int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the array have no work.
    if (!(idx < N)) return;

    const float value = src[idx];
    dst[idx] = value;
}
/*******************************************/
/* MEMCPY2 - EACH THREAD COPIES TWO FLOATS */
/*******************************************/
/* Copies src[0..N) to dst[0..N), two elements per thread.
 *
 * Expects a 1D launch in which every block covers 2 * blockDim.x consecutive
 * elements: a thread handles index i0 and index i0 + blockDim.x.
 * Each of the two indices is bounds-checked on its own, so N does not have to
 * be a multiple of 2 * blockDim.x.
 */
__global__ void memcpy2(float *src, float *dst, unsigned int N)
{
    const unsigned int i0 = threadIdx.x + blockIdx.x * (2 * blockDim.x);
    const unsigned int i1 = i0 + blockDim.x;

    // Both loads are issued before any store so that they are independent
    // of each other (instruction-level parallelism).
    float a0 = 0.0f;
    float a1 = 0.0f;
    if (i0 < N) a0 = src[i0];
    if (i1 < N) a1 = src[i1];

    if (i0 < N) dst[i0] = a0;
    if (i1 < N) dst[i1] = a1;
}
/********************************************/
/* MEMCPY4 - EACH THREAD COPIES FOUR FLOATS */
/********************************************/
/* Copies src[0..N) to dst[0..N), four elements per thread.
 *
 * Expects a 1D launch in which every block covers 4 * blockDim.x consecutive
 * elements: a thread handles indices i0 + k * blockDim.x for k = 0..3.
 * Each index is bounds-checked on its own, so N does not have to be a
 * multiple of 4 * blockDim.x.
 */
__global__ void memcpy4(float *src, float *dst, unsigned int N)
{
    const unsigned int i0 = threadIdx.x + blockIdx.x * (4 * blockDim.x);
    const unsigned int i1 = i0 + blockDim.x;
    const unsigned int i2 = i0 + 2 * blockDim.x;
    const unsigned int i3 = i0 + 3 * blockDim.x;

    // All loads are issued before any store so that they are independent
    // of each other (instruction-level parallelism).
    float a0 = 0.0f;
    float a1 = 0.0f;
    float a2 = 0.0f;
    float a3 = 0.0f;
    if (i0 < N) a0 = src[i0];
    if (i1 < N) a1 = src[i1];
    if (i2 < N) a2 = src[i2];
    if (i3 < N) a3 = src[i3];

    if (i0 < N) dst[i0] = a0;
    if (i1 < N) dst[i1] = a1;
    if (i2 < N) dst[i2] = a2;
    if (i3 < N) dst[i3] = a3;
}
/***********************************************/
/* MEMCPY4_2 - EACH THREAD COPIES FOUR FLOATS2 */
/***********************************************/
/* Copies N floats from src to dst as N/2 float2 values, four float2 per
 * thread.
 *
 * N is the number of floats, so the kernel moves N/2 float2 elements; if N is
 * odd the last float is not copied. Expects a 1D launch in which every block
 * covers 4 * blockDim.x consecutive float2 elements: a thread handles indices
 * i0 + k * blockDim.x for k = 0..3. Each index is bounds-checked on its own,
 * so N/2 does not have to be a multiple of 4 * blockDim.x.
 * src and dst must be suitably aligned for float2 accesses.
 */
__global__ void memcpy4_2(float2 *src, float2 *dst, unsigned int N)
{
    const unsigned int count = N / 2;   // number of float2 elements
    const unsigned int i0 = threadIdx.x + blockIdx.x * (4 * blockDim.x);
    const unsigned int i1 = i0 + blockDim.x;
    const unsigned int i2 = i0 + 2 * blockDim.x;
    const unsigned int i3 = i0 + 3 * blockDim.x;

    // All loads are issued before any store so that they are independent
    // of each other (instruction-level parallelism).
    float2 a0 = make_float2(0.0f, 0.0f);
    float2 a1 = make_float2(0.0f, 0.0f);
    float2 a2 = make_float2(0.0f, 0.0f);
    float2 a3 = make_float2(0.0f, 0.0f);
    if (i0 < count) a0 = src[i0];
    if (i1 < count) a1 = src[i1];
    if (i2 < count) a2 = src[i2];
    if (i3 < count) a3 = src[i3];

    if (i0 < count) dst[i0] = a0;
    if (i1 < count) dst[i1] = a1;
    if (i2 < count) dst[i2] = a2;
    if (i3 < count) dst[i3] = a3;
}
/********/
/* MAIN */
/********/

/* Records the stop event, waits for the timed launches to finish and prints
 * the throughput. The figure counts N * sizeof(float) bytes per launch (the
 * payload size) over N_iter launches; the elapsed time is in milliseconds,
 * hence the 1e-6 factor to obtain GB/s. */
static void reportBandwidth(cudaEvent_t start, cudaEvent_t stop, int N, int N_iter)
{
    float time_ms = 0.0f;
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaGetLastError());      // surface launch errors from the timed loop
    gpuErrchk(cudaEventElapsedTime(&time_ms, start, stop));
    printf("GB/s = %f\n", (1.e-6)*(float)((size_t)N*N_iter*sizeof(float))/time_ms);
}

/* Copies N floats from d_dst into h_result and compares them with h_ref.
 * Prints the first mismatch and returns false; returns true if all match. */
static bool verifyCopy(const float *h_ref, float *h_result, const float *d_dst, int N)
{
    gpuErrchk(cudaMemcpy(h_result, d_dst, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) {
        if (h_result[i] != h_ref[i]) {
            // The values are floats, so they must be printed with %f.
            printf("Error at i=%i! Host = %f; Device = %f\n", i, h_ref[i], h_result[i]);
            return false;
        }
    }
    return true;
}

/* Times the four copy kernels on the same input, prints the throughput of
 * each and checks each result against the host data. Stops at the first
 * kernel whose output is wrong. Returns 0 on success, 1 on failure. */
int main()
{
    const int N      = 131072;
    const int N_iter = 20;
    const size_t bytes = N*sizeof(float);

    // --- Setting host data and memory space for result
    float* h_vect   = (float*)malloc(bytes);
    float* h_result = (float*)malloc(bytes);
    if (h_vect == NULL || h_result == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_vect);
        free(h_result);
        return 1;
    }
    for (int i=0; i<N; i++) h_vect[i] = (float)i;

    // --- Setting device data and memory space for result
    // Every kernel writes to its own buffer so that a stale result from an
    // earlier kernel cannot hide a failure.
    float* d_src;     gpuErrchk(cudaMalloc((void**)&d_src,     bytes));
    float* d_dest1;   gpuErrchk(cudaMalloc((void**)&d_dest1,   bytes));
    float* d_dest2;   gpuErrchk(cudaMalloc((void**)&d_dest2,   bytes));
    float* d_dest4;   gpuErrchk(cudaMalloc((void**)&d_dest4,   bytes));
    float* d_dest4_2; gpuErrchk(cudaMalloc((void**)&d_dest4_2, bytes));
    gpuErrchk(cudaMemcpy(d_src, h_vect, bytes, cudaMemcpyHostToDevice));

    // --- Warmup
    for (int i=0; i<N_iter; i++) memcpy1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest1, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Creating events for timing
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    bool ok = true;

    /***********/
    /* MEMCPY1 */
    /***********/
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i=0; i<N_iter; i++) {
        memcpy1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest1, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }
    reportBandwidth(start, stop, N, N_iter);
    ok = verifyCopy(h_vect, h_result, d_dest1, N);

    /***********/
    /* MEMCPY2 */
    /***********/
    if (ok) {
        gpuErrchk(cudaEventRecord(start, 0));
        for (int i=0; i<N_iter; i++) {
            // Each block covers 2*BLOCKSIZE floats.
            memcpy2<<<iDivUp(N,2*BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest2, N);
#ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
#endif
        }
        reportBandwidth(start, stop, N, N_iter);
        ok = verifyCopy(h_vect, h_result, d_dest2, N);
    }

    /***********/
    /* MEMCPY4 */
    /***********/
    if (ok) {
        gpuErrchk(cudaEventRecord(start, 0));
        for (int i=0; i<N_iter; i++) {
            // Each block covers 4*BLOCKSIZE floats.
            memcpy4<<<iDivUp(N,4*BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest4, N);
#ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
#endif
        }
        reportBandwidth(start, stop, N, N_iter);
        ok = verifyCopy(h_vect, h_result, d_dest4, N);
    }

    /*************/
    /* MEMCPY4_2 */
    /*************/
    if (ok) {
        gpuErrchk(cudaEventRecord(start, 0));
        for (int i=0; i<N_iter; i++) {
            // N/2 float2 elements in total; each block covers 4*BLOCKSIZE of them.
            memcpy4_2<<<iDivUp(N/2,4*BLOCKSIZE), BLOCKSIZE>>>((float2*)d_src, (float2*)d_dest4_2, N);
#ifdef DEBUG
            gpuErrchk(cudaPeekAtLastError());
            gpuErrchk(cudaDeviceSynchronize());
#endif
        }
        reportBandwidth(start, stop, N, N_iter);
        ok = verifyCopy(h_vect, h_result, d_dest4_2, N);
    }

    // --- Releasing resources
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_src));
    gpuErrchk(cudaFree(d_dest1));
    gpuErrchk(cudaFree(d_dest2));
    gpuErrchk(cudaFree(d_dest4));
    gpuErrchk(cudaFree(d_dest4_2));
    free(h_vect);
    free(h_result);
    gpuErrchk(cudaDeviceReset());

    return ok ? 0 : 1;
}
下面是上述代码在GeForce GT540M、Kepler K20c和Tesla C2050上运行时的性能(每个带宽数值后面的百分比为占用率)。
BLOCKSIZE 32
GT540M K20c Tesla C2050
memcpy1 2.3GB/s 13% 28.1GB/s 18% 14.9GB/s 12%
memcpy2 4.4GB/s 13% 41.1GB/s 18% 24.8GB/s 13%
memcpy4 7.5GB/s 13% 54.8GB/s 18% 34.6GB/s 13%
memcpy4_2 11.2GB/s 14% 68.8GB/s 18% 44.0GB/s 14%
BLOCKSIZE 64
GT540M K20c Tesla C2050
memcpy1 4.6GB/s 27% 44.1GB/s 36% 26.1GB/s 26%
memcpy2 8.1GB/s 27% 57.1GB/s 36% 35.7GB/s 26%
memcpy4 11.4GB/s 27% 63.2GB/s 36% 43.5GB/s 26%
memcpy4_2 12.6GB/s 27% 72.8GB/s 36% 49.7GB/s 27%
BLOCKSIZE 128
GT540M K20c Tesla C2050
memcpy1 8.0GB/s 52% 60.6GB/s 78% 36.1GB/s 52%
memcpy2 11.6GB/s 52% 61.6GB/s 78% 44.8GB/s 52%
memcpy4 12.4GB/s 52% 62.2GB/s 78% 48.3GB/s 52%
memcpy4_2 12.5GB/s 52% 61.9GB/s 78% 49.5GB/s 52%
BLOCKSIZE 256
GT540M K20c Tesla C2050
memcpy1 10.6GB/s 80% 61.2GB/s 74% 42.0GB/s 77%
memcpy2 12.3GB/s 80% 66.2GB/s 74% 48.2GB/s 77%
memcpy4 12.4GB/s 80% 66.4GB/s 74% 45.5GB/s 77%
memcpy4_2 12.6GB/s 70% 72.6GB/s 74% 50.8GB/s 77%
BLOCKSIZE 512
GT540M K20c Tesla C2050
memcpy1 10.3GB/s 80% 54.5GB/s 75% 41.6GB/s 75%
memcpy2 12.2GB/s 80% 67.1GB/s 75% 47.7GB/s 75%
memcpy4 12.4GB/s 80% 67.9GB/s 75% 46.9GB/s 75%
memcpy4_2 12.5GB/s 55% 70.1GB/s 75% 48.3GB/s 75%
上述结果表明,如果您正确利用指令级并行(ILP),通过为每个线程分配更多工作来隐藏延迟,那么即使占用率较低(例如27%),也可以获得更好的性能(例如在GT540M上达到约12GB/s)。