Question

我又问了同样的问题。 https://stackoverflow.com/a/21200781/3208577 Roger Dahl回答了这个问题，但在运行代码之后输出错误了。我不知道为什么会这样，从我的理解一切都是正确的。在main（）函数中，距离矩阵的输出显示只有（i，0）元素被正确填充，其他元素为0.有人可以解释为什么会这样吗？让我重新发布罗杰斯的代码：

    #include  "cuda_runtime.h"
    #include <iostream>

    using namespace std;

    const int N(20);

    #define check(ans) { _check((ans), __FILE__, __LINE__); }
    inline void _check(cudaError_t code, char *file, int line)
    {
      if (code != cudaSuccess) {
        fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
      }
    }

    int div_up(int a, int b) {
      return ((a % b) != 0) ? (a / b + 1) : (a / b);
    }

    __global__ void calc_distances(double* distances,
      double* atoms_x, double* atoms_y, double* atoms_z);

    int main(int argc, char **argv)
    {
      double* atoms_x_h;
      check(cudaMallocHost(&atoms_x_h, N * sizeof(double)));

      double* atoms_y_h;
      check(cudaMallocHost(&atoms_y_h, N * sizeof(double)));

      double* atoms_z_h;
      check(cudaMallocHost(&atoms_z_h, N * sizeof(double)));

      for (int i(0); i < N; ++i) {
        atoms_x_h[i] = i;
        atoms_y_h[i] = i;
        atoms_z_h[i] = i;
      }

      double* atoms_x_d;
      check(cudaMalloc(&atoms_x_d, N * sizeof(double)));

      double* atoms_y_d;
      check(cudaMalloc(&atoms_y_d, N * sizeof(double)));

      double* atoms_z_d;
      check(cudaMalloc(&atoms_z_d, N * sizeof(double)));

      check(cudaMemcpy(atoms_x_d, atoms_x_h, N * sizeof(double), cudaMemcpyHostToDevice));
      check(cudaMemcpy(atoms_y_d, atoms_y_h, N * sizeof(double), cudaMemcpyHostToDevice));
      check(cudaMemcpy(atoms_z_d, atoms_z_h, N * sizeof(double), cudaMemcpyHostToDevice));

      double* distances_d;
      check(cudaMalloc(&distances_d, N * N * sizeof(double)));

      const int threads_per_block(256);
      dim3 n_blocks(div_up(N, threads_per_block));

      calc_distances<<<n_blocks, threads_per_block>>>(distances_d, atoms_x_d, atoms_y_d, atoms_z_d);

      check(cudaPeekAtLastError());
      check(cudaDeviceSynchronize());

      double* distances_h;
      check(cudaMallocHost(&distances_h, N * N * sizeof(double)));

      check(cudaMemcpy(distances_h, distances_d, N * N * sizeof(double), cudaMemcpyDeviceToHost));
//wrong output here
      for (int i(0); i < N; ++i) {
        for (int j(0); j < N; ++j) {
          cout << "(" << i << "," << j << "): " << distances_h[i + N * j] << endl;
        }
      }

      check(cudaFree(distances_d));
      check(cudaFreeHost(distances_h));
      check(cudaFree(atoms_x_d));
      check(cudaFreeHost(atoms_x_h));
      check(cudaFree(atoms_y_d));
      check(cudaFreeHost(atoms_y_h));
      check(cudaFree(atoms_z_d));
      check(cudaFreeHost(atoms_z_h));

      return 0;
    }

    __global__ void calc_distances(double* distances,
      double* atoms_x, double* atoms_y, double* atoms_z)
    {
      int i(threadIdx.x + blockIdx.x * blockDim.x);
      int j(threadIdx.y + blockIdx.y * blockDim.y);

      if (i >= N || j >= N) {
        return;
      }

      distances[i + N * j] =
        (atoms_x[i] - atoms_x[j]) * (atoms_x[i] - atoms_x[j]) +
        (atoms_y[i] - atoms_y[j]) * (atoms_y[i] - atoms_y[j]) +
        (atoms_z[i] - atoms_z[j]) * (atoms_z[i] - atoms_z[j]);
    }

Answer 1

对于您编写的内核，您的启动参数是错误的。他们应该是

  const int threads_per_dir(16);
  dim3 threads_per_block(threads_per_dir,threads_per_dir);
  dim3 n_blocks(div_up(N, threads_per_block.x), div_up(N,threads_per_block.y));

当您在代码中启动1D网格上的1D块时，您的threadIdx.y和blockIdx.y始终为0，因此j始终为0.

Cuda，距离计算btw 3d对象，再次

1 个答案: