我正在为一组点构建 kNN。编译 CUDA 文件并运行测试代码后,总是得到 `RuntimeError: CUDA error: an illegal memory access was encountered`。我使用了两个张量:`new_xyz` 和 `xyz`。`new_xyz` 是查询点,`xyz` 是被搜索的点集。问题出现在访问 `xyz` 中的数据时:一旦读取它,就会触发上面的内存错误(非法内存访问)。我不清楚为什么会这样,大家有什么想法吗?
#include <ATen/cuda/CUDAContext.h>
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// Batch-aware ball query: for every query point, collects the indices of up to
// `nsample` points of `xyz` that carry the same batch id as the query and whose
// squared distance to it is strictly smaller than radius * radius.
//
// Layout (row-major, 4 values per point: x, y, z, batch id):
//   new_xyz: (n1, 4)        query points
//   xyz:     (n2, 4)        search base
//   idx:     (n1, nsample)  output; row p receives the matching indices into
//                           xyz in ascending order, unused slots are set to -1
//
// Launch: 1D blocks along x, one thread per query point; only blockIdx.y == 0
// does any work. No shared memory is used.
//
// NOTE(review): every buffer is read/written as `long`, so the pointers must
// reference device-accessible storage whose elements are exactly sizeof(long)
// bytes wide. Handing in e.g. float32/int32 storage makes the kernel index
// past the end of the allocation (illegal memory access) — confirm the dtype
// and device of the tensors at the call site.
__global__ void ball_query_kernel_fast(int n1, int n2, float radius, int nsample,
const long *__restrict__ new_xyz, const long *__restrict__ xyz, long *__restrict__ idx) {
    if (blockIdx.y >= 1) return;

    // 64-bit thread index so the row offsets below cannot overflow 32 bits.
    const long long pt_idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= n1) return;

    // A row without slots has nothing to fill; writing idx[0] for the first
    // match would land outside the output buffer.
    if (nsample <= 0) return;

    const long *query = new_xyz + pt_idx * 4;
    long *out = idx + pt_idx * nsample;

    const float radius2 = radius * radius;
    const float qx = (float)query[0];
    const float qy = (float)query[1];
    const float qz = (float)query[2];
    const long qb = query[3];

    int found = 0;
    for (int i = 0; i < n2 && found < nsample; ++i) {
        const long *p = xyz + (long long)i * 4;

        // Only points of the same batch are candidates.
        if (p[3] != qb) continue;

        const float dx = qx - (float)p[0];
        const float dy = qy - (float)p[1];
        const float dz = qz - (float)p[2];
        const float d2 = dx * dx + dy * dy + dz * dz;
        if (d2 < radius2) {
            out[found] = (long)i;
            ++found;
        }
    }

    // Mark the slots that did not receive a neighbour.
    for (int l = found; l < nsample; ++l) {
        out[l] = -1;
    }
}
// Host-side launcher: runs ball_query_kernel_fast on the current PyTorch CUDA
// stream with one thread per query point.
//
// The pointers are forwarded to the kernel unchanged, which indexes them as:
//   new_xyz: (n1, 4)        query points  (x, y, z, batch id)
//   xyz:     (n2, 4)        search base   (x, y, z, batch id)
//   idx:     (n1, nsample)  output indices into xyz, -1 for unused slots
//
// The launch is asynchronous. Only launch errors reported by cudaGetLastError()
// are detected here (the process is terminated with exit(-1) in that case);
// faults raised while the kernel runs surface at a later synchronizing call.
void ball_query_kernel_launcher_fast(int n1, int n2, float radius, int nsample, const long *new_xyz, const long *xyz, long *idx) {
    // Nothing to compute. A grid with zero blocks is an invalid launch
    // configuration and would make the error check below kill the process,
    // and an output row without slots must not be written to.
    if (n1 <= 0 || nsample <= 0) {
        return;
    }

    const dim3 blocks(DIVUP(n1, THREADS_PER_BLOCK), 1);  // blockIdx.x(col), blockIdx.y(row)
    const dim3 threads(THREADS_PER_BLOCK);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    ball_query_kernel_fast<<<blocks, threads, 0, stream>>>(n1, n2, radius, nsample, new_xyz, xyz, idx);
    // cudaDeviceSynchronize();  // for using printf in kernel function

    const cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
以上是我的 CUDA 代码。看起来一切正常,但就是无法运行。我通过逐段注释掉代码并重新运行测试来定位问题:只要把 `ball_query_kernel_fast` 中对 `xyz` 的所有访问都去掉,就不会再报错。