我正在使用 cuSPARSE 库中的专用三对角求解器 cusparseDgtsv()
求解线性方程组,但发现它并没有带来预期的加速效果。
我在 Tesla K40s 上运行了下面的测试代码,编译命令如下:
nvcc -lcusparse main.cu -o dgtsv.app -gencode arch=compute_35,code=sm_35
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include <cuda_runtime.h>
#include <cusparse.h>
void run_Dgtsv_test(int n);
void run_Managed_Dgtsv_test(int n);
int tridiagonalmatrixalgorithm(double* a, double* b, double* c, double* d, double** X, int size);
int main(int argc, char** argv) {
    // Double the system size each iteration, starting at 512, until the
    // allocation would approach INT_MAX/16 bytes per array.
    const int growth = 2;
    const int limit = (INT_MAX / 16) / (sizeof(double));

    printf("Dgtsv speed test\n SLAE size, Dgtsv time, Serial \n");
    for (int size = 512; size < limit; size *= growth) {
        run_Dgtsv_test(size);
    }

    printf("####################### Testing MANAGED Malloc ###########################\n");
    printf("Managed malloc Dgtsv speed test\n SLAE size, Dgtsv time, Serial \n");
    for (int size = 512; size < limit; size *= growth) {
        run_Managed_Dgtsv_test(size);
    }
}
/**
 * Benchmark cusparseDgtsv (explicit cudaMalloc/cudaMemcpy device buffers)
 * against the serial Thomas algorithm for an n-by-n tridiagonal system,
 * printing the average time of one solve for each.
 *
 * NOTE(review): clock() measures CPU time, not wall time; cudaEvent-based
 * timing would be more accurate for the GPU path — confirm before relying
 * on these numbers.
 * NOTE(review): the serial solver below treats its first argument as the
 * MAIN diagonal, while cusparseDgtsv takes it as the LOWER diagonal, so
 * the two paths do not solve the identical system; this only affects
 * what is solved, not the timing comparison itself.
 *
 * @param n number of unknowns in the tridiagonal system
 */
void run_Dgtsv_test(int n) {
// {{{
    cusparseHandle_t cusparseHandle;
    cusparseStatus_t cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        // Fix: the original stored this status but never checked it.
        fprintf(stderr, "Failed to create cuSPARSE handle in Dgtsv test\n");
        return;
    }
    const int num_of_tests = 50;
    double* a = new double[n];   // lower diagonal (as passed to cuSPARSE)
    double* b = new double[n];   // main diagonal
    double* c = new double[n];   // upper diagonal
    double* d = new double[n];   // right-hand side
    double* tmp = new double[n]; // output buffer for the serial solver
    for (int i = 0; i < n; i++) {
        a[i] = 0;
        b[i] = 4 * i;
        c[i] = 0;
        d[i] = 8 * i;
        tmp[i] = i;
    }
    // First lower / last upper entries stay zero: they lie outside the matrix.
    for (int i = 1; i < n - 1; i++) {
        a[i] = i;
        c[i] = 2 * i;
    }
    size_t size = sizeof(double) * n;
    double* a_dev;
    double* b_dev;
    double* c_dev;
    double* d_dev;
    cudaMalloc((void**)&a_dev, size);
    cudaMalloc((void**)&b_dev, size);
    cudaMalloc((void**)&c_dev, size);
    cudaMalloc((void**)&d_dev, size);
    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(c_dev, c, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dev, d, size, cudaMemcpyHostToDevice);
    /* ------------------------------ cuSPARSE TEST ------------------------------ */
    clock_t t1 = clock();
    for (int i = 0; i < num_of_tests; i++) {
        // cusparseDgtsv overwrites d_dev with the solution in place, so each
        // iteration after the first solves a system with a different rhs.
        cusparseStatus = cusparseDgtsv(cusparseHandle, n, 1,
                                       a_dev, b_dev, c_dev, d_dev, n);
        if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
            fprintf(stderr, "Failed to perform CUSPARSE Dgtsv: int Dgtsv test\n");
            // NOTE(review): a bare `throw;` with no active exception calls
            // std::terminate — kept for behavioral parity, but exit() or a
            // thrown exception object would be clearer.
            throw;
        }
        cudaDeviceSynchronize();
    }
    clock_t t2 = clock();
    /* ------------------------------- serial test ------------------------------- */
    for (int i = 0; i < num_of_tests; i++) {
        tridiagonalmatrixalgorithm(a, b, c, d, &tmp, n);
    }
    clock_t t3 = clock();
    double time1 = ((double)(t2 - t1)) / CLOCKS_PER_SEC / num_of_tests;
    double time2 = ((double)(t3 - t2)) / CLOCKS_PER_SEC / num_of_tests;
    printf("(%9.d, %9.6f, %9.6f) \n", n, time1, time2);
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    cudaFree(d_dev);
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] tmp; // fix: tmp was leaked in the original
    cusparseDestroy(cusparseHandle);
}
// }}}
/**
 * Benchmark cusparseDgtsv using cudaMallocManaged (unified memory) buffers
 * against the serial Thomas algorithm, printing the average time of one
 * solve for each.
 *
 * Bug fixed: the original called
 *     cudaMemcpy((void**)&a_dev, a, size, cudaMemcpyHostToDevice);
 * which copies `size` bytes over the ADDRESS of the host pointer variable
 * (corrupting the stack) instead of into the managed buffer. The destination
 * must be the pointer value itself: cudaMemcpy(a_dev, a, size, ...).
 *
 * NOTE(review): clock() measures CPU time, not wall time; cudaEvent-based
 * timing would be more accurate for the GPU path.
 *
 * @param n number of unknowns in the tridiagonal system
 */
void run_Managed_Dgtsv_test(int n) {
// {{{
    cusparseHandle_t cusparseHandle;
    cusparseStatus_t cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        // Fix: the original stored this status but never checked it.
        fprintf(stderr, "Failed to create cuSPARSE handle in managed Dgtsv test\n");
        return;
    }
    const int num_of_tests = 50;
    double* a = new double[n];   // lower diagonal (as passed to cuSPARSE)
    double* b = new double[n];   // main diagonal
    double* c = new double[n];   // upper diagonal
    double* d = new double[n];   // right-hand side
    double* tmp = new double[n]; // output buffer for the serial solver
    for (int i = 0; i < n; i++) {
        a[i] = 0;
        b[i] = 4 * i;
        c[i] = 0;
        d[i] = 8 * i;
        tmp[i] = i;
    }
    // First lower / last upper entries stay zero: they lie outside the matrix.
    for (int i = 1; i < n - 1; i++) {
        a[i] = i;
        c[i] = 2 * i;
    }
    size_t size = sizeof(double) * n;
    double* a_dev;
    double* b_dev;
    double* c_dev;
    double* d_dev;
    cudaMallocManaged((void**)&a_dev, size);
    cudaMallocManaged((void**)&b_dev, size);
    cudaMallocManaged((void**)&c_dev, size);
    cudaMallocManaged((void**)&d_dev, size);
    // Managed memory is accessible from the host, but cudaMemcpy into the
    // buffer (not into &pointer!) is still valid and migrates the pages.
    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(c_dev, c, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dev, d, size, cudaMemcpyHostToDevice);
    /* ------------------------------ cuSPARSE TEST ------------------------------ */
    clock_t t1 = clock();
    for (int i = 0; i < num_of_tests; i++) {
        // cusparseDgtsv overwrites d_dev with the solution in place.
        cusparseStatus = cusparseDgtsv(cusparseHandle, n, 1,
                                       a_dev, b_dev, c_dev, d_dev, n);
        if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
            fprintf(stderr, "Failed to perform CUSPARSE Dgtsv: int Dgtsv test\n");
            // NOTE(review): bare `throw;` outside a catch terminates; kept
            // for parity with the non-managed test.
            throw;
        }
        cudaDeviceSynchronize();
    }
    clock_t t2 = clock();
    /* ------------------------------- serial test ------------------------------- */
    for (int i = 0; i < num_of_tests; i++) {
        tridiagonalmatrixalgorithm(a, b, c, d, &tmp, n);
    }
    clock_t t3 = clock();
    double time1 = ((double)(t2 - t1)) / CLOCKS_PER_SEC / num_of_tests;
    double time2 = ((double)(t3 - t2)) / CLOCKS_PER_SEC / num_of_tests;
    printf("(%9.d, %9.6f, %9.6f) \n", n, time1, time2);
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    cudaFree(d_dev);
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] tmp; // fix: tmp was leaked in the original
    cusparseDestroy(cusparseHandle);
}
// }}}
/** Serial tridiagonal (Thomas algorithm) solver.
 *
 * All arrays must have length @p size. Within THIS routine @p a is the
 * main diagonal, @p b the lower diagonal and @p c the upper diagonal;
 * @p d is the right-hand side and the solution is written into *X.
 * The entries b[0] and c[size-1] lie outside the matrix and are never
 * read — they exist only so every array has the same length.
 *
 * | a c 0 0 ... 0 0 | | x |   | d |
 * | b a c 0 ... 0 0 | | x |   | d |
 * | 0 b a c ... 0 0 | | x | = | d |
 * | :            :  | | : |   | : |
 * | 0 ...     b a c | | x |   | d |
 * | 0 ...     0 b a | | x |   | d |
 *
 * @param X pointer to a caller-owned array of length @p size that
 *          receives the solution
 * @return always 0
 */
int tridiagonalmatrixalgorithm(double* a, double* b, double* c, double* d, double** X, int size) {
// {{{
    double* sol   = (double*) calloc(size, sizeof(double)); // forward-sweep solution
    double* upper = (double*) calloc(size, sizeof(double)); // scaled upper diagonal

    // Forward elimination: reduce the system to an upper bidiagonal one.
    double pivot = a[0];
    sol[0] = d[0] / pivot;
    for (int k = 1; k < size; k++) {
        upper[k - 1] = c[k - 1] / pivot;
        pivot = a[k] - b[k] * upper[k - 1];
        sol[k] = (d[k] - b[k] * sol[k - 1]) / pivot;
    }

    // Back substitution.
    for (int k = size - 2; k >= 0; k--) {
        sol[k] -= upper[k] * sol[k + 1];
    }

    // Copy the result into the caller's buffer.
    for (int k = 0; k < size; k++) {
        (*X)[k] = sol[k];
    }

    free(upper);
    free(sol);
    return 0;
}
// }}}
运行后得到如下输出:
Dgtsv speed test
SLAE size, Dgtsv time, Serial
( 512, 0.000600, 0.000200)
( 1024, 0.001000, 0.000000)
( 2048, 0.000800, 0.000200)
( 4096, 0.001000, 0.000200)
( 8192, 0.000800, 0.000200)
( 16384, 0.001000, 0.000200)
( 32768, 0.001600, 0.000600)
( 65536, 0.001800, 0.001400)
( 131072, 0.002000, 0.002600)
( 262144, 0.003000, 0.005400)
( 524288, 0.005000, 0.011600)
( 1048576, 0.008400, 0.023000)
( 2097152, 0.015800, 0.045400)
( 4194304, 0.030400, 0.090200)
( 8388608, 0.059600, 0.192800)
####################### Testing MANAGED Malloc ###########################
Managed malloc Dgtsv speed test
SLAE size, Dgtsv time, Serial
( 512, 0.001000, 0.000000)
( 1024, 0.001400, 0.000000)
( 2048, 0.001200, 0.000000)
( 4096, 0.001400, 0.000000)
( 8192, 0.001200, 0.000200)
( 16384, 0.001200, 0.000400)
( 32768, 0.001400, 0.000600)
( 65536, 0.001800, 0.001400)
( 131072, 0.002400, 0.002600)
( 262144, 0.003000, 0.005600)
( 524288, 0.004800, 0.011000)
( 1048576, 0.008600, 0.022000)
( 2097152, 0.015800, 0.045800)
( 4194304, 0.030600, 0.091200)
( 8388608, 0.059800, 0.179600)
编译器版本
$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2016 NVIDIA Corporation
Built on Tue_Jan_10_13:22:03_CST_2017
Cuda compilation tools, release 8.0, V8.0.61
$ g++ --version
g++ (GCC) 4.8.5 20150623 (Red Hat 4.8.5-4)
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
请问我遗漏了什么?谢谢!