Question

我正在为一组点实现 kNN（球查询）。编译 CUDA 文件并运行测试代码后，总是得到 RuntimeError: CUDA error: an illegal memory access was encountered。我使用两个张量 new_xyz 和 xyz：new_xyz 是查询点，xyz 是被搜索的点集。问题出现在访问 xyz 中的数据时，这会触发非法内存访问。我不明白为什么会这样。大家有什么想法吗？

#include <ATen/cuda/CUDAContext.h>
#include <ATen/ATen.h>

#include <cuda.h>
#include <cuda_runtime.h>

// Not referenced by the code visible in this file.
#define TOTAL_THREADS 1024
// Number of threads per block used by the host launcher below.
#define THREADS_PER_BLOCK 256
// Integer division of m by n, plus one when there is a positive remainder
// (ceiling division for positive operands). Both arguments are evaluated
// twice, so do not pass expressions with side effects.
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))

/**
 * Ball query: for every query row, collect up to `nsample` indices of
 * reference rows that carry the same 4th-column value and whose squared
 * Euclidean distance over the first three columns is strictly below
 * `radius * radius`.
 *
 * Launch layout: 1-D blocks along x, one thread per query row; threads with
 * blockIdx.y != 0 or an index >= n1 exit immediately. No shared memory.
 *
 * @param n1       number of query rows
 * @param n2       number of reference rows
 * @param radius   search radius (compared in squared form)
 * @param nsample  number of output slots per query row; nothing is written
 *                 when it is <= 0
 * @param new_xyz  query rows, read as n1 x 4 values (3 coordinates + tag)
 * @param xyz      reference rows, read as n2 x 4 values (3 coordinates + tag)
 * @param idx      output, written as n1 x nsample values; slots beyond the
 *                 number of matches found are filled with -1
 *
 * NOTE(review): the tag column is presumably a batch index - confirm with the
 * caller. All three pointers must be device-accessible and must really point
 * at `long`-sized elements; handing in the storage of a float/int32 tensor or
 * of a CPU tensor makes the reads below run out of bounds - verify dtype and
 * device at the call site.
 */
__global__ void ball_query_kernel_fast(int n1, int n2, float radius, int nsample,
    const long *__restrict__ new_xyz, const long *__restrict__ xyz, long *__restrict__ idx) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= 1 || pt_idx >= n1) return;
    // With no output slots there is nothing to write; the guard also prevents
    // a store to idx[0] of an empty row.
    if (nsample <= 0) return;

    // Compute row offsets in 64 bits so that pt_idx * nsample cannot overflow int.
    const long long row = pt_idx;
    new_xyz += row * 4;
    idx += row * nsample;

    const float radius2 = radius * radius;
    const float new_x = (float) new_xyz[0];
    const float new_y = (float) new_xyz[1];
    const float new_z = (float) new_xyz[2];
    const long new_b = new_xyz[3];

    long num_cnt = 0;
    // Stop scanning as soon as every output slot has been filled.
    for (long i = 0; i < (long)n2 && num_cnt < nsample; i++) {
        const long *pt = xyz + (long long)i * 4;
        // Only rows carrying the same tag as the query are candidates.
        if (pt[3] != new_b) continue;

        const float dx = new_x - (float) pt[0];
        const float dy = new_y - (float) pt[1];
        const float dz = new_z - (float) pt[2];
        const float d2 = dx * dx + dy * dy + dz * dz;

        if (d2 < radius2) {
            idx[num_cnt] = i;
            ++num_cnt;
        }
    }
    // Mark the unused tail of the row.
    for (long l = num_cnt; l < nsample; l++) {
        idx[l] = -1;
    }
}

/**
 * Host-side launcher for ball_query_kernel_fast on the current PyTorch CUDA
 * stream.
 *
 * @param n1       number of query rows
 * @param n2       number of reference rows
 * @param radius   search radius
 * @param nsample  number of output slots per query row
 * @param new_xyz  query rows, read by the kernel as (n1, 4)
 * @param xyz      reference rows, read by the kernel as (n2, 4)
 * @param idx      output, written by the kernel as (n1, nsample)
 *
 * The launch is asynchronous: only launch-configuration errors are detected
 * here. Faults raised while the kernel runs (e.g. an illegal memory access)
 * surface at a later synchronizing CUDA call.
 *
 * NOTE(review): the pointers are passed through unchecked; they are expected
 * to be device pointers to `long`-sized elements - confirm that the calling
 * code validates tensor dtype, device and contiguity.
 */
void ball_query_kernel_launcher_fast(int n1, int n2, float radius, int nsample, const long *new_xyz, const long *xyz, long *idx) {
    // Nothing to compute for an empty query set or zero output slots. A grid
    // with zero blocks is an invalid launch configuration and would otherwise
    // end in exit(-1) below.
    if (n1 <= 0 || nsample <= 0) return;

    dim3 blocks(DIVUP(n1, THREADS_PER_BLOCK), 1);  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    ball_query_kernel_fast<<<blocks, threads, 0, stream>>>(n1, n2, radius, nsample, new_xyz, xyz, idx);
    // cudaDeviceSynchronize();  // for using printf in kernel function

    // Reads and clears the last error; this reports launch failures only.
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}

这是我的 CUDA 代码。看起来一切正常，但无法运行。我通过注释掉部分代码并重新运行测试来定位问题：如果去掉 ball_query_kernel_fast 中所有对 xyz 的访问，就不会出错。

RuntimeError：CUDA 错误：使用 PyTorch C++ 扩展时遇到非法内存访问

0 个答案: