使用相同的代码和机器(GPU除外),运行时间为:
Titan V:490ms
2080ti:380ms
我原以为Titan V的双精度性能会比RTX 2080ti更好,但结果并非如此。也许我需要使用某些编译参数才能将代码编译为双精度,或者是我的驱动程序有问题?
我正在使用Ubuntu 18.04。
内核代码是:
/*
 * Sparse matrix-vector product in CSR layout with three values per stored
 * entry, followed by the addition of a per-row offset vector.
 *
 * For every row i < N:
 *   CH[3*i + c] = CHDemag[3*i + c]
 *               + sum over n in [xadj[i], xadj[i+1]) of
 *                     data_c[n] * Cspin[3*adjncy[n] + c]
 * where c = 0,1,2 selects dataxx / datayy / datazz respectively.
 *
 * Parameters:
 *   N        number of rows; threads with a global index >= N do nothing.
 *   xadj     row offsets; entries [0, N] are read (N+1 entries required).
 *   adjncy   column index of each stored entry.
 *   dataxx, datayy, datazz
 *            per-entry values used for component 0, 1 and 2.
 *   Cspin    input vector, three interleaved components per column.
 *   CHDemag  offset vector, three interleaved components per row.
 *   CH       output vector, three interleaved components per row.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per row, with
 * gridDim.x * blockDim.x >= N. No shared memory is used.
 *
 * Element offsets are computed in size_t so that 3*row and 3*column cannot
 * overflow 32-bit arithmetic, and the entry counter is unsigned to match the
 * type of the offsets stored in xadj.
 */
__global__ void CSpMV_CSR(unsigned int N,
                          unsigned int *xadj,unsigned int *adjncy,
                          double *dataxx,double *datayy,double *datazz,
                          double *Cspin,
                          double *CHDemag,double *CH)
{
    // Widen before multiplying so the global index cannot wrap for large grids.
    const size_t row = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
    if (row >= N)
    {
        return;  // tail threads of the last block have no row to process
    }

    double dot[3] = {0, 0, 0};

    const unsigned int rowBegin = xadj[row];
    const unsigned int rowEnd = xadj[row + 1];
    for (unsigned int n = rowBegin; n < rowEnd; n++)
    {
        // Base offset of the three components belonging to this column.
        const size_t neighBase = 3 * (size_t)adjncy[n];
        const double val[3] = {dataxx[n], datayy[n], datazz[n]};
        for (unsigned int co = 0; co < 3; co++)
        {
            dot[co] += (val[co] * Cspin[neighBase + co]);
        }
    }

    const size_t outBase = 3 * row;
    CH[outBase]     = CHDemag[outBase]     + dot[0];
    CH[outBase + 1] = CHDemag[outBase + 1] + dot[1];
    CH[outBase + 2] = CHDemag[outBase + 2] + dot[2];
}
这是我的时间测量代码:
// Time 200 back-to-back launches of CSpMV_CSR plus the final device-to-host
// copy of the result.
//
// CUDA events are used instead of clock(): clock() reports processor time
// consumed by the host process, which is not a reliable measure of how long
// asynchronously executed GPU work takes. Events are timestamped by the
// device in stream order.
int threadsperblock,blockspergrid;
threadsperblock=256;
// Ceiling division so that every one of the SIZE rows gets a thread.
blockspergrid=(SIZE+threadsperblock-1)/threadsperblock;

cudaEvent_t start, end;
cudaEventCreate(&start);
cudaEventCreate(&end);

cudaEventRecord(start);
for(int i=0;i<200;i++){
    CSpMV_CSR<<<blockspergrid,threadsperblock>>>(SIZE,Cxadj,Cadjncy,Cdxx,Cdyy,Cdzz,Cspin,CHDemag,CH);
}
// Kernel launches do not return a status; a bad launch configuration is only
// visible through cudaGetLastError().
cudaError_t launchStatus = cudaGetLastError();
if(launchStatus != cudaSuccess){
    std::cerr << "CSpMV_CSR launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
}
// Print the status codes of the synchronisation and of the copy (0 == cudaSuccess).
std::cout<<cudaDeviceSynchronize()<<std::endl;
cout<<cudaMemcpy(H,CH,3*SIZE*sizeof(double),cudaMemcpyDeviceToHost)<<endl;
cudaEventRecord(end);
cudaEventSynchronize(end);  // wait until the stop event has actually been recorded

float elapsedMs = 0.0f;
cudaEventElapsedTime(&elapsedMs, start, end);  // result is in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(end);

double endtime = (double)elapsedMs / 1000.0;  // seconds, as before
cout << "cost time:" << endtime * 1000 << "ms" << endl;