I am new to CUDA. I wrote only a short code to test a kernel that computes the accelerations of massive particles. I time it only with time ./example. I am on Kubuntu 12.04 with an Intel(R) Core(TM) i5 CPU 760 @ 2.80GHz and a GeForce GTX 560, and I compile with nvcc -O3 -arch=sm_20 -o example example.cu. Here is my code:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
__global__ void acc_sh(double *x, double *y, double *z, double *ax, double *ay, double *az, double *mass, int N)
{
  extern __shared__ double4 shPos[]; //make dynamic
  int p = blockDim.x;
  int idx = blockIdx.x*p + threadIdx.x;
  if (idx > N-1) return;
  double3 acc = (double3){0.0,0.0,0.0};
  double posx = x[idx];
  double posy = y[idx];
  double posz = z[idx];

  // Tile
  for (int k = 0; k < N; k += p) {
    //Load positions into shmem
    shPos[threadIdx.x].x = x[k + threadIdx.x];
    shPos[threadIdx.x].y = y[k + threadIdx.x];
    shPos[threadIdx.x].z = z[k + threadIdx.x];
    shPos[threadIdx.x].w = mass[k + threadIdx.x];
    __syncthreads();

    for (int j = 0; j < p && k + j < N; j++) {
      //Loop over the shmem
      double rijx = posx - shPos[j].x;
      double rijy = posy - shPos[j].y;
      double rijz = posz - shPos[j].z;

      double dist = rijx*rijx + rijy*rijy + rijz*rijz;
      double dist3 = dist*dist*dist;

      double apre = 0.0;
      if (dist3 != 0) //avoid self-interaction
      {
        apre = rsqrt(dist3)*shPos[j].w;
      }

      acc.x += apre*rijx;
      acc.y += apre*rijy;
      acc.z += apre*rijz;
    }
    __syncthreads();
  }

  ax[idx] = acc.x;
  ay[idx] = acc.y;
  az[idx] = acc.z;
}
__global__ void acc(double *x, double *y, double *z, double *ax, double *ay, double *az, double *mass, int N)
{
  int p = blockDim.x;
  int idx = blockIdx.x*p + threadIdx.x;
  if (idx > N-1) return;
  double3 acc = (double3){0.0,0.0,0.0};
  double posx = x[idx];
  double posy = y[idx];
  double posz = z[idx];

  // Do not use shmem and loop over all bodies
  for (int k = 0; k < N; k++) {
    double rijx = posx - x[k];
    double rijy = posy - y[k];
    double rijz = posz - y[k];

    double dist = rijx*rijx + rijy*rijy + rijz*rijz;
    double dist3 = dist*dist*dist;

    double apre = 0.0;
    if (dist3 != 0) //avoid self-interaction
    {
      apre = rsqrt(dist3)*mass[k];
    }

    acc.x += apre*rijx;
    acc.y += apre*rijy;
    acc.z += apre*rijz;
    __syncthreads();
  }

  ax[idx] = acc.x;
  ay[idx] = acc.y;
  az[idx] = acc.z;
}
int main()
{
  srand(time(NULL));
  const int N = 16384;
  double t, dt, tend;

  //INIT TEST PARTICLES
  // HOST
  double *x, *y, *z, *mass;
  double *ax, *ay, *az, *dmass;
  //DEVICE
  double *dx, *dy, *dz;
  double *dax, *day, *daz;

  double size = N*sizeof(double);

  cudaMalloc((void**)&dx, size);
  cudaMalloc((void**)&dy, size);
  cudaMalloc((void**)&dz, size);
  cudaMalloc((void**)&dmass, size);
  cudaMalloc((void**)&dax, size);
  cudaMalloc((void**)&day, size);
  cudaMalloc((void**)&daz, size);

  x = (double*) malloc(size);
  y = (double*) malloc(size);
  z = (double*) malloc(size);
  mass = (double*) malloc(size);
  ax = (double*) malloc(size);
  ay = (double*) malloc(size);
  az = (double*) malloc(size);

  for (int i = 0; i < N; i++)
  {
    x[i] = (double) rand()/RAND_MAX;
    y[i] = (double) rand()/RAND_MAX;
    z[i] = (double) rand()/RAND_MAX;
    mass[i] = (double) rand()/RAND_MAX;
    // printf("%d %10.5e %10.5e %10.5e %10.5e \n", i, x[i], y[i], z[i], mass[i]);

    ax[i] = 0;
    ay[i] = 0;
    az[i] = 0;
  }

  cudaMemcpy(dx, x, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dy, y, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dz, z, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dmass, mass, size, cudaMemcpyHostToDevice);
  cudaMemcpy(dax, ax, size, cudaMemcpyHostToDevice);
  cudaMemcpy(day, ay, size, cudaMemcpyHostToDevice);
  cudaMemcpy(daz, az, size, cudaMemcpyHostToDevice);

  t = 0.0;       //start integ. time
  tend = 365.0;  //end integr. time, about one year
  dt = 1.0;
  int TPB = 128;
  int BPG = (N/TPB)+1;

  //********************************************************
  //MAIN CYCLE**********************************************
  //********************************************************
  while (t <= tend) {
    printf("time [d] %24.20f \n", t);
    acc_sh<<< BPG, TPB, sizeof(double4)*TPB >>>(dx,dy,dz,dax,day,daz,dmass,N);
    //acc<<< BPG, TPB >>>(dx,dy,dz,dax,day,daz,dmass,N);
    t += dt;
  }

  cudaMemcpy(x, dx, size, cudaMemcpyDeviceToHost);
  cudaMemcpy(y, dy, size, cudaMemcpyDeviceToHost);
  cudaMemcpy(z, dz, size, cudaMemcpyDeviceToHost);
  cudaMemcpy(ax, dax, size, cudaMemcpyDeviceToHost);
  cudaMemcpy(ay, day, size, cudaMemcpyDeviceToHost);
  cudaMemcpy(az, daz, size, cudaMemcpyDeviceToHost);

  //********************************************************
  //OUTPUT RESULTS******************************************
  //********************************************************
  /*for (int j = 0; j < N; j++) {
    printf("%d %23.16e %23.16e %23.16e \n", j+1, ax[j], ay[j], az[j]);
  }*/

  cudaFree(dx);
  cudaFree(dy);
  cudaFree(dz);
  cudaFree(ax);
  cudaFree(ay);
  cudaFree(az);

  return 0;
}
When I run it and measure the total runtime of the application, I get these times:

Non-shared (in the MAIN CYCLE only acc_sh is commented out): real 0m44.933s, user 0m32.838s, sys 0m12.001s

Shared (in the MAIN CYCLE only acc is commented out): real 0m44.259s, user 0m32.710s, sys 0m11.445s

The times are comparable! Why? I expected that acc_sh, which uses shared memory, would be faster... The next question is: why does the program run so fast at the beginning, and towards tend seem to wait for "something"?
Answer 0 (score: 2):
Don't use a double quantity to specify the number of bytes to allocate or transfer:

double size = N*sizeof(double);

Use int, unsigned, or size_t instead. When I compile your code, I see a number of warnings about this.
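A minimal sketch of that change, keeping the variable names from the code above (the remaining allocation and copy calls stay as they are):

  size_t size = N*sizeof(double);   // byte count as an integer type, not a double

  cudaMalloc((void**)&dx, size);    // no implicit double-to-integer conversion warnings
  x = (double*) malloc(size);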
There is an error in your acc kernel code that produces incorrect results and affects the timing:

double rijy = posy - y[k];
double rijz = posz - y[k];
                     ^
                     that should be z[k], not y[k]

This coding mistake significantly reduces the amount of data that the non-shared kernel needs to load, which makes that kernel (incorrectly) appear to perform better. If you had bothered to compare and verify the results between the two cases, you would have found a discrepancy there as well.
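For reference, a corrected version of that inner loop could look like the following (same structure as the original acc kernel, with only the z component fixed; the per-iteration __syncthreads() is also unnecessary here, since this kernel uses no shared memory):

  for (int k = 0; k < N; k++) {
    double rijx = posx - x[k];
    double rijy = posy - y[k];
    double rijz = posz - z[k];   // read z[k] here, not y[k]

    double dist  = rijx*rijx + rijy*rijy + rijz*rijz;
    double dist3 = dist*dist*dist;

    double apre = 0.0;
    if (dist3 != 0)              // avoid self-interaction
      apre = rsqrt(dist3)*mass[k];

    acc.x += apre*rijx;
    acc.y += apre*rijy;
    acc.z += apre*rijz;
  }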
When I fix those errors, on my particular setup the non-shared case runs in about 21 seconds and the shared case in about 18 seconds.
If you were expecting a 10x improvement from moving from global to shared memory, that is simply implausible. Shared memory bandwidth is only about 5x better than global memory bandwidth, so 10x is unreasonable even in a perfect case. Furthermore, this kind of comparison discounts the effect of the L1 and L2 caches in the GPU, which, for frequently accessed data, can bring global memory accesses up to nearly the level of shared memory performance.
Regarding the question "why does the program run so fast at the beginning, and towards tend wait for 'something'?": kernel launches are asynchronous. A kernel launch returns control to the host thread before the kernel begins executing. When you launch kernels in a loop like this, each launch returns control to the host immediately (before that kernel has even started running), and the host issues the next launch. Those launches pile up in a queue of limited depth; once the queue is full, further launches block until earlier kernels finish. So the loop appears to race through its first iterations and then seems to "wait" near the end, where it is really just waiting for all of the queued kernels to complete.
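One way to see this behavior, and to make a wall-clock measurement such as time ./example reflect actual kernel execution, is to force the host to wait for the queued work. This is only a sketch around the existing loop, not part of the original program:

  while (t <= tend) {
    acc_sh<<< BPG, TPB, sizeof(double4)*TPB >>>(dx,dy,dz,dax,day,daz,dmass,N);
    t += dt;
  }
  cudaDeviceSynchronize();               // block the host until every queued kernel has finished
  cudaError_t err = cudaGetLastError();  // also surfaces any launch or runtime errors
  if (err != cudaSuccess)
    printf("CUDA error: %s\n", cudaGetErrorString(err));

The cudaMemcpy calls after the loop synchronize as well, which is why all of the queued work is eventually charged to the program's runtime; the explicit cudaDeviceSynchronize() simply makes the waiting point visible.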