I tried to parallelize a matrix transpose operation using the cuBLAS library (the cublasSgeam function). The output data is correct, but on average it takes about 150 times longer than my CPU version. Why is that?
CPU code (transposing an N = 5000 by M = 140 matrix):
// Starting the timer
float *matrixT = (float *) malloc (N * M * sizeof(float));
for (int i = 0; i < N; i++)
    for (int j = 0; j < M; j++)
        matrixT[(j*N)+i] = matrix[(i*M)+j]; // matrix is obviously filled
//Ending the timer
GPU code (transposing an N = 5000 by M = 140 matrix):
float *h_matrixT , *d_matrixT , *d_matrix;
h_matrixT = (float *) malloc (N * M * sizeof(float));
cudaMalloc((void **)&d_matrixT , N * M * sizeof(float));
cudaMalloc((void**)&d_matrix , N * M * sizeof(float));
cudaMemcpy(d_matrix , matrix , N * M * sizeof(float) , cudaMemcpyHostToDevice);
//Starting the timer
const float alpha = 1.0;
const float beta = 0.0;
cublasHandle_t handle;
cublasCreate(&handle);
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
cublasDestroy(handle);
//Ending the timer
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
cudaFree(d_matrix);
cudaFree(d_matrixT);
Elapsed times:
CUBLAS: 148.461 ms
CPU: 0.986944 ms
PS: Running on a GeForce GTX 660 and an Intel Core i5 660.
Answer 0 (score: 3):
Run your code with one of the profilers to see where the time is being spent.
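For example, with the nvprof command-line profiler that ships with the CUDA toolkit (here pointed at the t469 binary built in the example below):
$ nvprof ./t469
Its per-call breakdown of CUDA API and kernel times makes one-time initialization costs easy to spot.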
Move the cublasCreate call out of your timing region. It picks up all sorts of CUDA and library start-up time, which should not be included in a benchmark of a single function (or, if you do intend to benchmark that way, there is obviously little point in using the GPU for this single function: as you have discovered, it will not speed it up).
I would also suggest moving cublasDestroy out of the timing loop.
You probably also want to include a cudaDeviceSynchronize(); before the final timer stop.
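As an alternative to gettimeofday plus an explicit synchronize, the same region could be timed with CUDA events, which measure on the GPU's own timeline. A minimal sketch, assuming handle, d_matrix, d_matrixT, alpha and beta are set up as in the code above:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);                 // enqueued on the default stream, before the transpose
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
cudaEventRecord(stop);                  // enqueued after the transpose
cudaEventSynchronize(stop);             // block until the GPU has passed the stop event
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time between the two events
printf("GPU time = %fms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);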
Here is a fully worked example, choosing M = 1000 and N = 1000, with the changes described above implemented:
$ cat t469.cu
#include <stdio.h>
#include <cublas_v2.h>
#include <time.h>
#include <sys/time.h>
#define uS_PER_SEC 1000000
#define uS_PER_mS 1000
#define N 1000
#define M 1000
int main(){
    timeval t1, t2;
    float *matrix = (float *) malloc (N * M * sizeof(float));
    // Starting the timer
    gettimeofday(&t1, NULL);
    float *matrixT = (float *) malloc (N * M * sizeof(float));
    for (int i = 0; i < N; i++)
        for (int j = 0; j < M; j++)
            matrixT[(j*N)+i] = matrix[(i*M)+j]; // matrix is obviously filled
    //Ending the timer
    gettimeofday(&t2, NULL);
    float et1 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
    printf("CPU time = %fms\n", et1);
    float *h_matrixT , *d_matrixT , *d_matrix;
    h_matrixT = (float *) (malloc (N * M * sizeof(float)));
    cudaMalloc((void **)&d_matrixT , N * M * sizeof(float));
    cudaMalloc((void**)&d_matrix , N * M * sizeof(float));
    cudaMemcpy(d_matrix , matrix , N * M * sizeof(float) , cudaMemcpyHostToDevice);
    //Starting the timer
    gettimeofday(&t1, NULL);
    const float alpha = 1.0;
    const float beta = 0.0;
    // gettimeofday(&t1, NULL);
    cublasHandle_t handle;
    cublasCreate(&handle);
    gettimeofday(&t1, NULL); // restart the timer here, after cublasCreate, so start-up cost is excluded
    cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
    cudaDeviceSynchronize(); // make sure the transpose has finished before stopping the timer
    gettimeofday(&t2, NULL);
    cublasDestroy(handle);
    //Ending the timer
    float et2 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
    printf("GPU time = %fms\n", et2);
    cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
    cudaFree(d_matrix);
    cudaFree(d_matrixT);
    return 0;
}
$ nvcc -O3 -arch=sm_20 -o t469 t469.cu -lcublas
$ ./t469
CPU time = 8.744000ms
GPU time = 0.327000ms
$
If instead I change the code above so that the timing starts before the cublasCreate call, I get this:
$ ./t469
CPU time = 9.475000ms
GPU time = 78.393997ms
$
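For context, the start-up cost matters little in a real application, because the handle is created once and reused across many calls. A minimal sketch of that pattern, assuming the buffers from the example above and an iteration count chosen purely for illustration:
cublasHandle_t handle;
cublasCreate(&handle);   // one-time start-up cost, paid once
for (int iter = 0; iter < 100; iter++) {
    // every call reuses the already-initialized library state
    cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
}
cudaDeviceSynchronize(); // wait for all queued transposes before timing or copying back
cublasDestroy(handle);
Amortized over many calls this way, the per-transpose cost approaches the 0.327 ms figure above rather than the 78 ms that includes initialization.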