This is driving me crazy. I have a 3D grid of 1D blocks. Each block contains 272 threads. Each thread computes the dot product of two vectors and stores its result at the corresponding position in a shared-memory array of size [272], the same as the number of threads. The host thread calls several kernels, and I accumulate the time they take to execute. When I comment out the line that writes to shared memory, the execution time is about 2,401 ms. When I uncomment the shared-memory write, I get a very long time, such as 450,309 ms. I tried using int values instead of double. I also tried having a single thread do the write, guarded by an if(threadIdx.x == 0) statement, to rule out possible bank conflicts. Nothing seems to help. Here is the calling (host-side) code:
double theta = 0;
int count = 0;
cudaEventRecord(start, 0);
while (theta < 180)
{
    theta += 0.18;
    calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
    calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
    count++;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);

err = cudaGetLastError();
if (cudaSuccess != err)
{
    fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
            __FILE__, __LINE__, cudaGetErrorString(err));
}
else
{
    fprintf(stderr, "\n \n Cuda NO error in file '%s' in line %i : %s.\n",
            __FILE__, __LINE__, cudaGetErrorString(err));
    printf("\n %d orientation updates: Total Time = %3.10f ms\n", count, elapsedTime);
}
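Note that the kernel launches inside the loop are asynchronous and cudaGetLastError() is only consulted once after the timing stops, so a launch failure in an early iteration would not surface until the very end. A minimal sketch of per-launch checking, purely illustrative and reusing the same arguments as above:

// Illustrative only: the same timing loop, checking each launch as it is issued.
// cudaPeekAtLastError() reports launch-configuration errors without resetting
// the error state; adding cudaDeviceSynchronize() here would also surface
// asynchronous execution errors, but it would serialize the loop and change the timing.
while (theta < 180)
{
    theta += 0.18;

    calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
    err = cudaPeekAtLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "calc_LF launch failed: %s\n", cudaGetErrorString(err));
        break;
    }

    calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
    err = cudaPeekAtLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "calc_S launch failed: %s\n", cudaGetErrorString(err));
        break;
    }

    count++;
}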
The offending kernel is calc_S, whose code is:
__global__ void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double G[H];

    int myTRA[W];
    int tx = threadIdx.x;

    for (int j = 0; j < W; j++)
    {
        myTRA[j] = getElement(g_traD, tx, j, W);
    }

    double sum;
    for (int j = 0; j < W; j++)
    {
        sum += myTRA[j] * LF[j];
    }

    // Write your sum to shared memory
    G[threadIdx.x] = sum;
    //__syncthreads();
}
I am using MS Visual Studio 2008 with CUDA 4.2 and a compute capability 2.0 GPU (i.e. a GeForce GTX 580).
Notes:
272 threads per block.
H/W thread limit: 1,536 / 272 = at most 5 blocks per SM.
Shared memory limit: G[272] doubles = 2,176 bytes needed; 48 KB / 2,176 = at most 22 blocks (that will never happen, but it tells us shared memory is not the limiting factor).
Registers are not an issue at all.
So it should be possible to execute 5 blocks concurrently.
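The same occupancy arithmetic can be double-checked from the device properties at run time. A minimal host-side sketch, assuming only the standard cudaDeviceProp fields (the 272 threads per block and the 2,176-byte G array come from the numbers above):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, 0);

    const int threadsPerBlock = 272;                   // launch configuration above
    const size_t smemPerBlock = 272 * sizeof(double);  // __shared__ double G[272] = 2,176 bytes

    // Blocks per SM limited by the thread count (1,536 / 272 = 5 on compute capability 2.0)
    int blocksByThreads = props.maxThreadsPerMultiProcessor / threadsPerBlock;

    // Blocks limited by shared memory (48 KB / 2,176 bytes = 22 on this device)
    int blocksBySmem = (int)(props.sharedMemPerBlock / smemPerBlock);

    printf("blocks/SM by thread limit: %d, by shared memory: %d\n",
           blocksByThreads, blocksBySmem);
    return 0;
}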
Thanks for your help.
Mak
EDIT:
Here is a shortened version of the whole code. The full code runs within the MatrixMul NVIDIA SDK sample.
In the file "MatrixMul.cu":
int main(int argc, char** argv)
{
    // reading data from Matlab into double arrays

    // CUDA begins here:
    if (shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        cutilDeviceInit(argc, argv);
    }
    else
    {
        cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));
    }

    int devID;
    cudaDeviceProp props;

    // get GPU props
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));
    printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name, props.major, props.minor);

    // Declare device memory for matrices read from Matlab
    double *X_dev;     // size 19 x 1
    double *Y_dev;     // size 19 x 1
    double *Z_dev;     // size 17 x 1
    double *r_vD;      // size 544 x 3
    double *g_oriD;    // size 544 x 3
    double *g_traD;    // size 272 x 544
    double *cov_D;     // size 272 x 272
    double *cov_i_D;   // size 272 x 272

    err = cudaMalloc((void**)&X_dev, sizeX*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&Y_dev, sizeY*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&Z_dev, sizeZ*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&r_vD, sizeR_V*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&g_oriD, sizeG_ori*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&g_traD, sizeG_tra*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&cov_D, sizeCov*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&cov_i_D, sizeCov_i*sizeof(double));
    errorCheck(err);

    // Transfer Xs, Ys, and Zs to GPU global memory
    cudaMemcpy(X_dev, dipole_x_coords, sizeX*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(Y_dev, dipole_y_coords, sizeY*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(Z_dev, dipole_z_coords, sizeZ*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);

    // Transfer r_v, g_ori, and g_tra to GPU memory
    cudaMemcpy(r_vD, r_v, sizeR_V*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(g_oriD, g_ori, sizeG_ori*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(g_traD, g_tra, sizeG_tra*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);

    // Transfer cov and cov_i to GPU memory
    cudaMemcpy(cov_D, cov_post, sizeCov*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(cov_i_D, cov_post_i, sizeCov_i*sizeof(double), cudaMemcpyHostToDevice);

    // Specify dimensions of block and grid
    dim3 gridDim(sizeX, sizeY, sizeZ);      // 19 x 19 x 17
    int numThreads = (int) sizeR_V/3;       // numThreads = 544
    dim3 blockDim(numThreads, 1, 1);        // 544 x 1 x 1

    // call CUDA wrapper
    float cf = runB(X_dev, Y_dev, Z_dev, r_vD, g_oriD, g_traD, cov_i_D, cov_D, blockDim, gridDim, sizeG_tra, tra_W, tra_H);

    int c = 0;
    scanf("%d", c);
    return 0;
}
float runB(double* X_dev, double* Y_dev, double* Z_dev,
           double* r_vD, double* g_oriD, double* g_traD, double* Ci, double* C,
           dim3 blockDim, dim3 gridDim, int sizeG_tra, int tra_W, int tra_H)
{
    cudaError err;

    // Calculate the size of thread output in global memory
    size_t size_F = gridDim.x * gridDim.y * gridDim.z * blockDim.x;
    size_t size_F_grad = gridDim.x * gridDim.y * gridDim.z * blockDim.x * 3;

    // Make global memory space for F and F_grad
    double* F_dev;
    double* F_grad_dev;
    err = cudaMalloc((void**)&F_dev, size_F*sizeof(double));
    errorCheck(err);
    err = cudaMalloc((void**)&F_grad_dev, size_F_grad*sizeof(double));
    errorCheck(err);

    // Allocate device memory for LF
    double *LF;
    err = cudaMalloc((void**)&LF, 544*sizeof(double));
    errorCheck(err);

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    double theta = 0;
    cudaEventRecord(start, 0);
    while (theta < 180)
    {
        theta += 0.18;
        calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
        calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
        count++;
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",
                __FILE__, __LINE__, cudaGetErrorString(err));
    }
    else
    {
        fprintf(stderr, "\n \n Cuda NO error in file '%s' in line %i : %s.\n",
                __FILE__, __LINE__, cudaGetErrorString(err));
        printf("\n 180 orientation updates: Total Time = %3.10f ms\n", elapsedTime);
    }

    return 0;
}
In the file "MatrixMul_kernel.cu":
#define HDM_DIM 3
__global__ void calc_LF(double* ori_dev, double* X_dev, double* Y_dev, double* Z_dev, double* F_dev, double* F_grad_dev,
                        double* g_oriD, double* r_vD, double* LF)
{
    // Get this block's global index
    int blockId = blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
    int tx = threadIdx.x;
    // This thread's global index
    int gtx = blockId*blockDim.x + threadIdx.x;

    double r_v[3];
    double g_ori[3];

    // Each thread reads 1 row (3 values) of r_vD
    r_v[0] = getElement(r_vD, tx, 0, HDM_DIM);
    r_v[1] = getElement(r_vD, tx, 1, HDM_DIM);
    r_v[2] = getElement(r_vD, tx, 2, HDM_DIM);

    // Each thread reads 1 row (3 values) of g_oriD (which contains grad.ori data)
    g_ori[0] = getElement(g_oriD, tx, 0, HDM_DIM);
    g_ori[1] = getElement(g_oriD, tx, 1, HDM_DIM);
    g_ori[2] = getElement(g_oriD, tx, 2, HDM_DIM);

    // Fetch d_ori from global memory
    double d_ori[3];
    for (int i = 0; i < 3; i++) {
        d_ori[i] = ori_dev[3*gtx + i];
    }

    // Read this block's X, Y, Z location
    double x = X_dev[blockIdx.x];
    double y = Y_dev[blockIdx.y];
    double z = Z_dev[blockIdx.z];

    double c2[HDM_DIM];
    c2[0] = d_ori[1]*z - d_ori[2]*y;
    c2[1] = d_ori[2]*x - d_ori[0]*z;
    c2[2] = d_ori[0]*y - d_ori[1]*x;

    // Fetch F and F_grad from global memory
    double F = F_dev[gtx];
    double F_grad[3];
    for (int j = 0; j < 3; j++)
    {
        F_grad[j] = F_grad_dev[gtx*3 + j];
    }

    double c1[HDM_DIM];
    c1[0] = F * c2[0];
    c1[1] = F * c2[1];
    c1[2] = F * c2[2];

    double d3 = c2[0]*r_v[0] + c2[1]*r_v[1] + c2[2]*r_v[2];

    double s2[HDM_DIM];
    for (int j = 0; j < HDM_DIM; j++)
    {
        s2[j] = d3*F_grad[j];
    }

    double s1[HDM_DIM];
    for (int j = 0; j < HDM_DIM; j++)
    {
        s1[j] = c1[j] - s2[j];
    }

    double b_v[HDM_DIM];
    for (int j = 0; j < HDM_DIM; j++)
    {
        b_v[j] = (10^-7)/(F*F) * s1[j];
    }

    double sum = 0;
    for (int j = 0; j < HDM_DIM; j++)
    {
        sum += b_v[j]*g_ori[j];
    }

    // Write this thread's value to global memory
    LF[tx] = sum;
}
It is worth mentioning that this calc_LF kernel also used to write its final result to shared memory, which increased the execution time from about 500 ms to about 2,500 ms (i.e., the shared-memory write line alone roughly multiplied the time by 5).
__global__ void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double T[H];
    __shared__ double G[H];

    // Get this block's global index
    int blockId = blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
    int tx = threadIdx.x;
    // This thread's global index
    int gtx = blockId*blockDim.x + threadIdx.x;

    int myTRA[W];
    double my_LF[W];

    for (int i = 0; i < W; i++) {
        my_LF[i] = LF[gtx];
    }
    for (int j = 0; j < W; j++) {
        myTRA[j] = getElement(g_traD, tx, j, W);
    }

    double sum;
    for (int j = 0; j < W; j++)
    {
        sum += myTRA[j] * my_LF[j];
    }

    // Write your sum to shared memory
    G[tx] = sum;
    __syncthreads();
}
Answer (score: 1):
The effect you are seeing is the result of a compiler optimization. Taking a compilable version of your basic kernel code:
#define H (128)
#define W (128)

__device__
double getElement(const double *g, int t, int j, int w)
{
    return g[t + j*w];
}

__global__
void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double G[H];

    // Get this block's global index
    int blockId = blockIdx.x + gridDim.x*blockIdx.y +
                  gridDim.x*gridDim.y*blockIdx.z;
    int tx = threadIdx.x;
    // This thread's global index
    int gtx = blockId*blockDim.x + threadIdx.x;

    int myTRA[W];
    double my_LF[W];

    for (int i = 0; i < W; i++) {
        my_LF[i] = LF[gtx];
    }
    for (int j = 0; j < W; j++) {
        myTRA[j] = getElement(g_traD, tx, j, W);
    }

    double sum;
    for (int j = 0; j < W; j++)
    {
        sum += myTRA[j] * my_LF[j];
    }

    // Write your sum to shared memory
    G[tx] = sum;
    __syncthreads();
}
and compiling it with CUDA 5 gives this:
$ nvcc -m64 -arch=sm_20 -cubin -Xptxas="-v" dead_code.cu
dead_code.cu(13): warning: variable "G" was set but never used
dead_code.cu(13): warning: variable "G" was set but never used
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z6calc_SPdS_S_S_' for 'sm_20'
ptxas info : Function properties for _Z6calc_SPdS_S_S_
1536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 23 registers, 1024 bytes smem, 64 bytes cmem[0]
There is a warning that the shared memory variable G is never used, but the compiler honours the store and emits code that consumes 23 registers. Now, if I comment out G[tx]=sum at the end of the kernel, it compiles like this:
$ nvcc -m64 -arch=sm_20 -cubin -Xptxas="-v" dead_code.cu
dead_code.cu(13): warning: variable "G" was declared but never referenced
dead_code.cu(13): warning: variable "G" was declared but never referenced
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z6calc_SPdS_S_S_' for 'sm_20'
ptxas info : Function properties for _Z6calc_SPdS_S_S_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 2 registers, 64 bytes cmem[0]
Now only two registers are used, and the toolchain emits this:
$ cuobjdump -sass dead_code.cubin
code for sm_20
Function : _Z6calc_SPdS_S_S_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0xfc1fdc03207e0000*/ IMAD.U32.U32 RZ, R1, RZ, RZ;
/*0010*/ /*0xffffdc0450ee0000*/ BAR.RED.POPC RZ, RZ;
/*0018*/ /*0x00001de780000000*/ EXIT;
i.e. four assembly instructions. All of your code has vanished.
The underlying source of this effect is compiler dead code removal. The compiler is smart enough to determine that code with no effect on global or shared memory output is not needed and can be removed. In this case, once the write to G was removed, the whole kernel was effectively pointless, and the compiler simply optimized the entire thing away. You can see some other examples of dead code removal and its effects here and here. The latter is in OpenCL, but the same mechanism applies.
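A practical consequence: to time the real work, the kernel's result has to remain observable after it finishes, for example by copying the per-thread sum out to global memory. A minimal sketch along those lines (H, W, and getElement as defined in the compilable version above; S_out is a hypothetical device buffer of one double per thread, introduced here for illustration only):

// Same computation as calc_S, but the result is written to global memory,
// so neither the loads, the multiply-adds, nor the shared-memory staging
// can be eliminated as dead code.
__global__ void calc_S_observable(double* g_traD, double* LF, double* S_out)
{
    __shared__ double G[H];

    int blockId = blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
    int tx  = threadIdx.x;
    int gtx = blockId*blockDim.x + tx;

    double sum = 0.0;   // initialized here, unlike the original
    for (int j = 0; j < W; j++)
    {
        sum += getElement(g_traD, tx, j, W) * LF[j];
    }

    G[tx] = sum;        // shared-memory staging, as in the original
    __syncthreads();

    S_out[gtx] = G[tx]; // observable output: keeps the whole kernel alive
}

Seen this way, the ~2,401 ms figure in the question is timing kernels whose bodies were optimized away, while the ~450,309 ms figure reflects the cost of the computation actually being performed.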