I ran into a problem using cublasDgemm (the cuBLAS routine; the result is A * B, with A = 750 × 600 and B = 600 × 1000):

for (i = 0; i < N; ++i) {
    cublasDgemm();
}

N=10: total time is 0.000473 s, average per call is 0.0000473 s
N=100: total time is 0.00243 s, average per call is 0.0000243 s
N=1000: total time is 0.715072 s, average per call is 0.000715 s
N=10000: total time is 10.4998 s, average per call is 0.00104998 s

Why does the average time per call increase so much?
#include <cuda_runtime.h>
#include <cublas_v2.h>   // v2 API only; the legacy cublas.h header is not needed
#include <sys/time.h>
#include <time.h>
#include <stdlib.h>
#include <iostream>

using namespace std;

// Column-major index of element (i,j) for a matrix with the given leading dimension
#define IDX2C(i,j,leading) (((j)*(leading))+(i))

#define CHECK_EQ(a,b) do { \
    if ((a) != (b)) { \
        cout << __FILE__ << " : " << __LINE__ \
             << " : check failed because " << (a) << " != " << (b) << endl; \
        exit(1); \
    } \
} while (0)

#define CUBLAS_CHECK(condition) do { \
    cublasStatus_t status = (condition); \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while (0)

#define CUDA_CHECK(condition) do { \
    cudaError_t error = (condition); \
    CHECK_EQ(error, cudaSuccess); \
} while (0)

// Check for errors after a kernel launch
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())

// Fill mat with n random values drawn uniformly from [-range, range].
// Seeding is done once in main; reseeding here on every call could repeat sequences.
template <class T>
void randMtx(T *mat, int n, double range) {
    for (int i = 0; i < n; ++i) {
        double flag = (rand() % 2 == 0) ? -1.0 : 1.0;
        mat[i] = flag * ((double)rand() / RAND_MAX) * range;
    }
}

int main(int argc, char *argv[]) {
    if (argc != 9) {
        cout << "usage: m1_row m1_col m2_row m2_col m1 m2 count range\n";
        return -1;
    }
    int row1 = atoi(argv[1]);
    int col1 = atoi(argv[2]);
    int row2 = atoi(argv[3]);
    int col2 = atoi(argv[4]);
    int count = atoi(argv[7]);
    double range = atof(argv[8]);

    // 't' as argument 5 or 6 transposes the corresponding operand
    cublasOperation_t opt1 = CUBLAS_OP_N;
    cublasOperation_t opt2 = CUBLAS_OP_N;
    int row3 = row1;   // rows of C
    int col3 = col2;   // cols of C
    int k = col1;      // inner dimension
    if (argv[5][0] == 't') {
        opt1 = CUBLAS_OP_T;
        row3 = col1;
        k = row1;
    }
    if (argv[6][0] == 't') {
        opt2 = CUBLAS_OP_T;
        col3 = row2;
    }

    double *mat1_c = (double*)malloc(sizeof(double)*row1*col1);
    double *mat2_c = (double*)malloc(sizeof(double)*row2*col2);
    double *mat3_c = (double*)malloc(sizeof(double)*row3*col3);  // result buffer (not copied back here)
    srand((unsigned int)time(NULL));
    randMtx(mat1_c, row1*col1, range);
    randMtx(mat2_c, row2*col2, range);

    double *mat1_g;
    double *mat2_g;
    double *mat3_g;
    double alpha = 1.0;
    double beta = 0.0;
    CUDA_CHECK(cudaMalloc((void **)&mat1_g, sizeof(double)*row1*col1));
    CUDA_CHECK(cudaMalloc((void **)&mat2_g, sizeof(double)*row2*col2));
    CUDA_CHECK(cudaMalloc((void **)&mat3_g, sizeof(double)*row3*col3));
    CUDA_CHECK(cudaMemcpy(mat1_g, mat1_c, sizeof(double)*row1*col1, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(mat2_g, mat2_c, sizeof(double)*row2*col2, cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    struct timeval beg, end;
    gettimeofday(&beg, NULL);
    for (int i = 0; i < count; ++i) {
        CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k,
                                 &alpha, mat1_g, row1, mat2_g, row2,
                                 &beta, mat3_g, row3));
    }
    cudaDeviceSynchronize();  // wait for all queued GEMMs before stopping the timer
    gettimeofday(&end, NULL);
    cout << "real time used: "
         << end.tv_sec - beg.tv_sec + (double)(end.tv_usec - beg.tv_usec)/1000000 << endl;

    cublasDestroy(handle);
    free(mat1_c);
    free(mat2_c);
    free(mat3_c);
    cudaFree(mat1_g);
    cudaFree(mat2_g);
    cudaFree(mat3_g);
    return 0;
}
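For reference, a build and run line like the following should work for this program (the source file name is an assumption):

nvcc -O2 -o dgemm_bench dgemm_bench.cpp -lcublas
./dgemm_bench 750 600 600 1000 n n 1000 1.0

The last two arguments are the iteration count and the random-value range; 'n' or 't' for arguments 5 and 6 selects whether each operand is transposed.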
The code above is my full test program. When I add cudaDeviceSynchronize after the loop block, the average call time is about 0.001 s regardless of the value of count.
Answer (score: 1)
As @talonmies pointed out, this behavior is probably exactly what you should expect.
When you call cublasDgemm, the call (usually) returns control to the host (CPU) thread before the operation has completed. In fact there is a queue for calls like this: each time you make such a call, the operation is placed in the queue and your host code continues.
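One way to see this directly is to time the GPU work with CUDA events instead of host timers. Below is a minimal sketch, assuming the same handle, device pointers, dimensions, and CHECK macros as the code in the question; cudaEventElapsedTime reports milliseconds between the two recorded events:

cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start, 0));   // marks the point in the stream before the GEMMs
for (int i = 0; i < count; ++i) {
    CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k,
                             &alpha, mat1_g, row1, mat2_g, row2,
                             &beta, mat3_g, row3));
}
CUDA_CHECK(cudaEventRecord(stop, 0));
CUDA_CHECK(cudaEventSynchronize(stop));  // wait until the stop event has actually occurred
float ms = 0.0f;
CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
cout << "GPU time per call: " << ms / count << " ms" << endl;
CUDA_CHECK(cudaEventDestroy(start));
CUDA_CHECK(cudaEventDestroy(stop));

Timed this way, the per-call figure stays roughly constant regardless of count, because it measures the GEMMs themselves rather than how fast the host can enqueue them.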
In addition, CUDA and cuBLAS usually have some one-time overhead associated with using the API. For example, the call that creates the cuBLAS handle typically takes measurable time, in order to initialize the library.
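A common way to keep that one-time cost out of the measurement is a warm-up call before the timed region. A sketch, again assuming the variables from the question's code:

// Warm-up: pays the one-time library initialization cost outside the timed region.
CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k,
                         &alpha, mat1_g, row1, mat2_g, row2,
                         &beta, mat3_g, row3));
CUDA_CHECK(cudaDeviceSynchronize());  // make sure the warm-up has finished
gettimeofday(&beg, NULL);             // only now start the timed loop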
So your measurements fall into 3 groups:
"Small" iteration counts (e.g. 10). Each call pays the cost of placing the Dgemm request into the queue, plus a share of the one-time start-up cost amortized over relatively few iterations. This corresponds to your measurement: "average per call is 0.0000473".
"Medium" iteration counts (e.g. 100). Here the amortized start-up cost per call becomes negligible, so the measurement is mostly just the time to add a Dgemm request to the queue. This corresponds to your measurement: "average per call is 0.0000243".
"Large" iteration counts (e.g. 1000-10000). At some point the internal request queue becomes full and stops accepting new requests until some earlier requests have completed and been removed from it. At that point the Dgemm call switches from non-blocking to blocking: it stalls the host/CPU thread until a queue slot becomes available. A new request then effectively has to wait for a previous request to finish, so the cost of a new Dgemm request rises to roughly the time needed to execute and complete a (previous) Dgemm request. Per-call cost therefore jumps sharply, from the cost of adding an item to the queue to the cost of completing a request. This corresponds to your measurements: "average per call is 0.000715" and "average per call is 0.00104998". The sketch below shows how to observe this blocking regime directly.
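To see the blocking regime at any iteration count, force each call to complete before the next is issued. A sketch under the same assumptions as above; each iteration now pays the full execution cost, so the average should sit near the ~0.001 s figure regardless of count:

gettimeofday(&beg, NULL);
for (int i = 0; i < count; ++i) {
    CUBLAS_CHECK(cublasDgemm(handle, opt1, opt2, row3, col3, k,
                             &alpha, mat1_g, row1, mat2_g, row2,
                             &beta, mat3_g, row3));
    CUDA_CHECK(cudaDeviceSynchronize());  // block until this GEMM completes
}
gettimeofday(&end, NULL);

This is consistent with your observation that, with cudaDeviceSynchronize after the loop, the average call time is about 0.001 s regardless of count.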