Here I want to compute the distance between every pair of points and determine whether they are neighbors. This is my simple CUDA code:
__global__ void calcNeighbors(const DataPoint* points,
                              const float doubleRadius, bool* neighbors) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float dis = 0.0f;
    while (tid < N) {
        DataPoint p1 = points[tid];
        for (int i=0; i<N; i++) {
            DataPoint p2 = points[i];
            dis = 0;
            dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
                   (p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
                   (p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
            if (dis <= doubleRadius) {
                neighbors[tid*N+i] = true;
            } else {
                neighbors[tid*N+i] = false;
            }
        }
        tid += blockDim.x * gridDim.x;
    }
}
DataPoint is a struct:
typedef struct DataPoint {
    float pfDimens[3];
} DataPoint;
So I want to reduce the running time. What should I do? I have tried using memory coalescing and shared memory, but I didn't get a good speedup.
=============== Using shared memory ===============
__global__ void calcNeighbors2(const DataPoint* points,
                               const float doubleRadius, bool* neighbors) {
    __shared__ DataPoint sharedpoints[threadsPerBlock];
    int start = blockIdx.x * blockDim.x;
    int len = start + threadIdx.x;
    if (len < N) {
        sharedpoints[threadIdx.x] = points[len];
    }
    len = imin(N, blockDim.x + start);
    __syncthreads();
    int tid = threadIdx.x;
    float dis;
    while (tid < N) {
        DataPoint p1 = points[tid];
        for (int i=start; i<len; i++) {
            dis = 0;
            dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
                   (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
                   (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
            if (dis <= doubleRadius) {
                neighbors[i*N+tid] = true;
            } else {
                neighbors[i*N+tid] = false;
            }
        }
        tid += blockDim.x;
    }
}
Here I changed neighbors[tid*N+i] to neighbors[i*N+tid], which gave me about an 8x speedup on a Tesla K10.G2.8GB. But when I use shared memory to store some of the points, it doesn't help?
Answer 0 (score: 2)
There are at least 4 ideas, some of which have already been stated in the comments:

1. Convert your point data storage from AoS format:
struct DataPoint {
    float pfDimens[3];
};
to SoA format:
struct DataPoint {
    float pfDimens_x[NPTS];
    float pfDimens_y[NPTS];
    float pfDimens_z[NPTS];
};
This will enable full coalescing when loading the data. In fact, to help with item 4 below, I would just switch to using 3 bare arrays rather than a structure.
2. Reduce the computation to (slightly less than) half:
for (int i=N-1; i>tid; i--) {
Then, either in the thread code itself or on the host, you can populate the other "half" of the output matrix by copying data.
3. Transpose the storage into your output matrix, so that you can write a store operation like this:
neighbors[i*N+tid] = true;
which will coalesce nicely, as opposed to this:
neighbors[tid*N+i] = true;
which will not. (For adjacent threads in a warp, i*N+tid yields consecutive addresses that coalesce into one transaction, while tid*N+i yields addresses N bytes apart.)
4. Since your input point data is read-only, mark the kernel parameters appropriately:
const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
In some cases, and on some GPUs, this will often result in a speedup due to use of the read-only cache. If you really want to get aggressive with caching, and your data array is small enough (4K or fewer float points), you could put a copy of the point data in __constant__ memory as well as the copy in global memory, and perform the "uniform" load you are doing here through constant memory:
DataPoint p2 = c_points[i];
Thus you could perform the coalesced loads through the read-only cache, the uniform loads through the constant cache, and have the coalesced stores go to ordinary global memory.
On a K40c, on linux/CUDA 7, the net effect of these changes appears to be about a 3.5x speedup at the kernel level, for N = 4096:
$ cat t749.cu
#include <stdio.h>
#define N 4096
// if N is 16K/3 or less, we can use constant
#define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
    float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
                              const float doubleRadius, bool* neighbors) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float dis = 0.0f;
    while (tid < N) {
        DataPoint p1 = points[tid];
        for (int i=0; i<N; i++) {
            DataPoint p2 = points[i];
            dis = 0;
            dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
                   (p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
                   (p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
            if (dis <= doubleRadius) {
                neighbors[tid*N+i] = true;
            } else {
                neighbors[tid*N+i] = false;
            }
        }
        tid += blockDim.x * gridDim.x;
    }
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y,
                               const float * __restrict__ pts_z, const float doubleRadius,
                               bool * __restrict__ neighbors) {
    int tid = threadIdx.x+blockDim.x*blockIdx.x;
    while (tid < N) {
        float p1x = pts_x[tid];
        float p1y = pts_y[tid];
        float p1z = pts_z[tid];
        for (int i = N-1; i > tid; i--){
            float p2x, p2y, p2z;
#ifdef USE_CONSTANT
            p2x = cpx[i];
            p2y = cpy[i];
            p2z = cpz[i];
#else
            p2x = pts_x[i];
            p2y = pts_y[i];
            p2z = pts_z[i];
#endif
            float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
            neighbors[i*N+tid] = (dis <= doubleRadius);
        }
        tid += blockDim.x * gridDim.x;
    }
}
int main(){
    float *dx, *dy, *dz, *hx, *hy, *hz;
    DataPoint *dp, *hp;
    bool *dn, *hn1, *hn2;
    hx =(float *)malloc(N*sizeof(float));
    hy =(float *)malloc(N*sizeof(float));
    hz =(float *)malloc(N*sizeof(float));
    hp =(DataPoint *)malloc(N*sizeof(DataPoint));
    hn1=(bool *)malloc(N*N*sizeof(bool));
    hn2=(bool *)malloc(N*N*sizeof(bool));
    cudaMalloc(&dx, N*sizeof(float));
    cudaMalloc(&dy, N*sizeof(float));
    cudaMalloc(&dz, N*sizeof(float));
    cudaMalloc(&dp, N*sizeof(DataPoint));
    cudaMalloc(&dn, N*N*sizeof(bool));
    for (int i =0; i < N; i++){
        hx[i] = rand()/(float)RAND_MAX;
        hy[i] = rand()/(float)RAND_MAX;
        hz[i] = rand()/(float)RAND_MAX;
        hp[i].pfDimens[0] = hx[i];
        hp[i].pfDimens[1] = hy[i];
        hp[i].pfDimens[2] = hz[i];
    }
    cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
    // populate the constant-memory copies of the point data;
    // without this, calcNeighbors2 would read zeros from cpx/cpy/cpz
    cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
    cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
    cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
    // warm-up
    calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
    cudaDeviceSynchronize();
    cudaMemset(dn, 0, N*N*sizeof(bool));
    unsigned long long t1 = dtime_usec(0);
    calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel 1 error");
    t1 = dtime_usec(t1);
    cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
    // warm-up
    calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
    cudaDeviceSynchronize();
    cudaMemset(dn, 0, N*N*sizeof(bool));
    unsigned long long t2 = dtime_usec(0);
    calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel 2 error");
    t2 = dtime_usec(t2);
    cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
    cudaCheckErrors("some error");
    printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
    // results validation
    for (int i = 0; i < N; i++)
        for (int j = i+1; j < N; j++)
            if (hn1[i*N+j] != hn2[j*N+i]) {
                printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]);
                return 1;
            }
    return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 0.004903s, t2: 0.001395s
$
In the case of the K40c, the limited number of blocks being launched above (16) is a significant impediment to performance, due to latency. If we comment out the USE_CONSTANT define and change N to 16384, we observe an even higher speedup with the improved kernel:

$ ./t749
t1: 0.267107s, t2: 0.008209s
$

The resulting ~48 blocks are enough to approximately "fill" the K40c, which has 15 SMs.

EDIT: Now that you have posted your shared memory kernel, I added it to my test case as calcNeighbors3 and compared its timing performance (as t3). It is almost as fast as my kernel, and it seems to provide the correct results (matching your original kernel), so I'm not sure what your concerns are.

Here's the updated code and test case:
For this test, I changed the data set size to 32768, since that is closer to the range you care about. Your shared memory kernel shows about a 42x speedup over your original kernel, and my kernel shows about a 55x speedup, on my K40c.