我正在尝试把这段代码用于三维结构,并使用 CUDA 的 2D(pitched)内存功能。主机端的线性数据(`board`)大小为 宽 × 高 × 深,而设备端的 2D 分配为 宽 ×(高 × 深)(此处每一层的宽和高均为 DIM 个元素)。内核把数据从 A 处理到 B。内存检查器报告下面这一行存在非法内存访问:
dst[offset] = curr;
如果我把分配的高度改为 HEIGHT * 2,错误就会消失,但各个尺寸看起来明明是匹配的。我漏掉了什么?也欢迎提出其他批评意见,我是 C++ 和 CUDA 的新手。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
// Short aliases for signed int / unsigned int.
typedef signed int sint;
typedef unsigned int uint;
// Edge length, in elements, of one square layer (inBounds() checks x and y against it).
#define DIM 512
// Blocks are TPB x TPB threads (see `threads` below).
#define TPB 32 // Threads per block
// Half-width of the square neighbourhood scanned by hasClearance().
#define CLEARANCE 5
// Number of layers; the z extent checked by inBounds().
#define MAPLAYERS 2
// Row width in BYTES (not elements); used as the width argument of
// cudaMallocPitch / cudaMemset2D / cudaMemcpy2D.
#define WIDTH (sizeof(sint) * DIM)
// Total number of rows: MAPLAYERS layers of DIM rows each.
#define HEIGHT (DIM * MAPLAYERS)
// Forward declarations of the host and device routines defined below.
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);
// Launch configuration used by route(): a (DIM / TPB) x (DIM / TPB) x MAPLAYERS
// grid of TPB x TPB blocks (block z extent is 1).
dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);
/** CUDA error check: forwards the status of a runtime call plus its call site to gpuAssert(). */
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**
 * Reports a failed CUDA runtime call and terminates the process.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise prints the error string
 * with `file` and `line` to stderr, waits for an integer on stdin, and exits
 * with `code` as the process status.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    // Block on stdin before exiting (presumably to keep a console window open).
    int pause;
    std::cin >> pause;
    exit(code);
}
/**
 * Entry point: allocates two pitched device buffers (A and B) of HEIGHT rows
 * by WIDTH bytes, zeroes them, runs route() once and releases everything.
 */
int main(void) {
sint *A;
sint *B;
size_t pitchA, pitchB;
// Host-side board; nothing writes to it before route() copies it to the device.
// NOTE(review): WIDTH is a byte count (sizeof(sint) * DIM), so this allocates
// sizeof(sint) times more elements than DIM * HEIGHT - confirm the intended size.
sint *board = new sint[WIDTH*HEIGHT];
// cudaMallocPitch returns the row stride, in bytes, through pitchA / pitchB.
CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));
CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
// NOTE(review): B is cleared with pitchA, not pitchB - looks like a typo; confirm.
CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT));
// src = 0 and dest = DIM*DIM - 1 are handed to route().
route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);
CER(cudaFree(A));
CER(cudaFree(B));
delete[] board;
}
/**
 * Copies the host board into both device buffers, launches map() reading from
 * B and writing to A, then copies the device-side flag back to the host.
 *
 * A / pitchA, B / pitchB: pitched device buffers and their row strides.
 * board: host data laid out as HEIGHT rows of WIDTH bytes.
 * src, dest: not used in the body.
 */
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
    const size_t flagBytes = sizeof(unsigned long);
    unsigned long hostFlag = NULL;
    unsigned long *devFlag;

    // Device-side flag, initialised from the host value.
    CER(cudaMalloc((void**)&devFlag, flagBytes));
    CER(cudaMemcpy(devFlag, &hostFlag, flagBytes, cudaMemcpyHostToDevice));

    // Upload the same board into both buffers; host rows are WIDTH bytes apart.
    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));

    map<<<blocks, threads>>>(B, pitchB, A, pitchA, devFlag);
    CER(cudaPeekAtLastError());

    CER(cudaMemcpy(&hostFlag, devFlag, flagBytes, cudaMemcpyDeviceToHost));
    if (hostFlag != NULL) {
        // break condition
    }
}
/**
 * Kernel: each thread handles one (x, y, z) cell. It reads the cell from src
 * and writes 1 to the same position in dst unless the cell is on an even
 * layer, holds 0 and hasClearance() returns true. `index` is not used in the body.
 */
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
// NOTE(review): blockIdx.z appears twice; threadIdx.z + blockIdx.z * blockDim.z
// was probably intended. With the launch in route() (block z extent 1, grid z
// extent MAPLAYERS) this yields z == 2 * blockIdx.z, so z == 1 is never visited.
unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
// NOTE(review): pitchDst is the byte pitch passed in from route(), yet the
// result is used below as an element index into a sint array - confirm the units.
unsigned long long offset = calcOffset(x, y, z, 0, 0, 0, pitchDst);
sint curr;
// Threads outside the DIM x DIM x MAPLAYERS volume do nothing.
if (!inBounds(x, y, z, 0, 0, 0))
return;
curr = src[calcOffset(x, y, z, 0, 0, 0, pitchSrc)];
if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) {
// Processing
}
else
dst[offset] = 1;
return;
}
/**
 * Finds linear offset for a given pixel and offset.
 *
 * Returns (x + xoff) + (y + yoff) * pitch + (z + zoff) * pitch * (HEIGHT / MAPLAYERS),
 * i.e. consecutive layers are HEIGHT / MAPLAYERS rows apart.
 * NOTE(review): callers pass the byte pitch obtained from cudaMallocPitch and
 * use the result to index a sint array, so row offsets are counted in bytes
 * while the index is in elements - confirm the intended units.
 */
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
return (x + xoff) + (y + yoff) * pitch + ((z + zoff) * pitch * (HEIGHT / MAPLAYERS));
}
/**
 * Tells whether the displaced position (x + xoff, y + yoff, z + zoff) lies
 * inside the map: 0 <= x, y < DIM and 0 <= z < MAPLAYERS.
 */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
    const sint px = x + xoff;
    const sint py = y + yoff;
    const sint pz = z + zoff;

    return px >= 0 && px < DIM
        && py >= 0 && py < DIM
        && pz >= 0 && pz < MAPLAYERS;
}
/**
 * Returns true when every cell of the (2 * CLEARANCE + 1)-wide square centred
 * on (x, y) in layer z is inside the map and holds neither 1 nor 2.
 * Returns false as soon as a neighbour is out of bounds or holds 1 or 2.
 */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
    for (int dy = -CLEARANCE; dy <= CLEARANCE; ++dy) {
        for (int dx = -CLEARANCE; dx <= CLEARANCE; ++dx) {
            // A neighbour outside the map counts as blocked.
            if (!inBounds(x, y, z, dx, dy, 0))
                return false;

            const long long cell = calcOffset(x, y, z, dx, dy, 0, pitch);
            if (src[cell] == 2 || src[cell] == 1)
                return false;
        }
    }
    return true;
}
CUDA调试器的输出:
Memory Checker detected 384 access violations.
error = access violation on load (global memory)
gridid = 18
blockIdx = {0,8,0}
threadIdx = {0,4,0}
address = 0x05d08000
accessSize = 4
答案 0 :(得分:2)
这看起来不对:
sint *board = new sint[WIDTH*HEIGHT];
我认为你的意思是:
sint *board = new sint[DIM*HEIGHT];
这看起来不对:
unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;
我认为你的意思是:
unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;
但问题的关键在于:你在计算 `sint` 数组下标的算术中直接使用了 pitch 值(即以字节为单位的行宽)。以这种方式计算下标时,需要把 pitch 值按 `sizeof(sint)` 缩放(即除以 `sizeof(sint)`)。但即便如此也并不完全正确。正确的做法是:先把基指针转换为 `unsigned char` 指针,按行 pitch(即字节数)做指针运算,再把行起始指针从 `unsigned char *` 转换回 `sint *`,然后用 `(x+xoff)` 从该位置开始索引。实际上,这意味着你的 `calcOffset` 例程需要重写:它需要接受底层指针作为参数,并返回一个指针。
所以这段代码有这些变化:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
// Short aliases for signed int / unsigned int.
typedef signed int sint;
typedef unsigned int uint;
// Edge length, in elements, of one square layer (inBounds() checks x and y against it).
#define DIM 512
// Blocks are TPB x TPB threads (see `threads` below).
#define TPB 32 // Threads per block
// Half-width of the square neighbourhood scanned by hasClearance().
#define CLEARANCE 5
// Number of layers; the z extent checked by inBounds().
#define MAPLAYERS 2
// Row width in BYTES (not elements); used as the width argument of
// cudaMallocPitch / cudaMemset2D / cudaMemcpy2D.
#define WIDTH (sizeof(sint) * DIM)
// Total number of rows: MAPLAYERS layers of DIM rows each.
#define HEIGHT (DIM * MAPLAYERS)
// Forward declarations of the host and device routines defined below.
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest);
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index);
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch);
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff);
// Takes the buffer base pointer and returns a pointer to the addressed element.
__device__ inline sint * calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch);
// Launch configuration used by route(): a (DIM / TPB) x (DIM / TPB) x MAPLAYERS
// grid of TPB x TPB blocks (block z extent is 1).
dim3 blocks(DIM / TPB, DIM / TPB, MAPLAYERS);
dim3 threads(TPB, TPB);
/**
 * CUDA error check: evaluates a runtime API call and passes its status to
 * gpuAssert() together with the call site. The do { } while (0) wrapper makes
 * the macro expand to exactly one statement, so `CER(x);` is safe inside an
 * unbraced if/else.
 */
#define CER(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
/**
 * Reports a failed CUDA runtime call and terminates the process.
 *
 * @param code status returned by the CUDA call
 * @param file source file of the call site
 * @param line source line of the call site
 *
 * Returns normally when `code` is cudaSuccess. Otherwise prints the error
 * string and call site to stderr, waits for an integer on stdin, and exits
 * with `code` as the process status.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

    // Block on stdin before exiting (presumably to keep a console window open).
    int tmp;
    std::cin >> tmp;
    exit(code);
}
/**
 * Entry point: allocates two pitched device buffers (A and B) of HEIGHT rows
 * by WIDTH bytes, zeroes them, runs route() once and releases everything.
 */
int main(void) {
    sint *A = NULL;
    sint *B = NULL;
    size_t pitchA = 0, pitchB = 0;

    // Host board: DIM elements per row, HEIGHT rows. Value-initialised so that
    // route() uploads defined (zero) data rather than indeterminate memory.
    sint *board = new sint[DIM * HEIGHT]();

    // cudaMallocPitch returns each buffer's row stride, in bytes, via pitchA / pitchB.
    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT));
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT));

    // Each buffer must be cleared with its own pitch.
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT));
    CER(cudaMemset2D(B, pitchB, 0, WIDTH, HEIGHT));

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1);

    CER(cudaFree(A));
    CER(cudaFree(B));
    delete[] board;
    return 0;
}
/**
 * Copies the host board into both device buffers, launches map() reading from
 * B and writing to A, then copies the device-side flag back to the host.
 *
 * @param A, pitchA  pitched device buffer written by the kernel, and its row stride
 * @param B, pitchB  pitched device buffer read by the kernel, and its row stride
 * @param board      host data laid out as HEIGHT rows of WIDTH bytes
 * @param src, dest  not used in the body
 *
 * Any CUDA failure is reported through CER, which terminates the process.
 */
void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) {
    unsigned long *dev_index = NULL;
    unsigned long index = 0;

    // Device-side flag, initialised to 0 from the host value.
    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long)));
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice));

    // Upload the same board into both buffers; host rows are WIDTH bytes apart.
    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice));

    map<<<blocks, threads>>>(B, pitchB, A, pitchA, dev_index);
    // Catches launch-configuration errors; faults raised while the kernel runs
    // surface at the blocking copy below.
    CER(cudaPeekAtLastError());

    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost));

    // Release the flag; without this every call leaks a device allocation.
    CER(cudaFree(dev_index));

    if (index != 0) {
        // break condition
    }
}
/**
 * Kernel: each thread handles one (x, y, z) cell. It reads the cell from src
 * and writes 1 to the same position in dst unless the cell is on an even
 * layer, holds 0 and hasClearance() returns true. `index` is not used in the body.
 *
 * Threads whose coordinates fall outside the map return without touching memory.
 */
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) {
    const int col   = blockIdx.x * blockDim.x + threadIdx.x;
    const int row   = blockIdx.y * blockDim.y + threadIdx.y;
    const int layer = blockIdx.z * blockDim.z + threadIdx.z;

    if (!inBounds(col, row, layer, 0, 0, 0))
        return;

    const sint cell = *calcOffset(src, col, row, layer, 0, 0, 0, pitchSrc);

    // && short-circuits, so hasClearance() only runs for empty cells on even layers.
    const bool candidate = (layer % 2 == 0) && (cell == 0)
                        && hasClearance(src, col, row, layer, pitchSrc);
    if (candidate) {
        // Processing
    } else {
        *calcOffset(dst, col, row, layer, 0, 0, 0, pitchDst) = 1;
    }
}
/**
 * Returns a pointer to element (x + xoff, y + yoff, z + zoff) of a buffer
 * whose rows are `pitch` bytes apart and whose layers are HEIGHT / MAPLAYERS
 * rows apart.
 *
 * Row and layer displacements are applied in bytes on an unsigned char
 * pointer; the column displacement is then applied in sint elements.
 */
__device__ sint* calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) {
    const size_t rowBytes   = (y + yoff) * pitch;
    const size_t layerBytes = (z + zoff) * pitch * (HEIGHT / MAPLAYERS);

    unsigned char *rowStart = reinterpret_cast<unsigned char *>(ptr) + (rowBytes + layerBytes);
    return reinterpret_cast<sint *>(rowStart) + (x + xoff);
}
/**
 * Tells whether the displaced position (x + xoff, y + yoff, z + zoff) lies
 * inside the map: 0 <= x, y < DIM and 0 <= z < MAPLAYERS.
 */
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) {
    const sint px = x + xoff;
    const sint py = y + yoff;
    const sint pz = z + zoff;

    return px >= 0 && px < DIM
        && py >= 0 && py < DIM
        && pz >= 0 && pz < MAPLAYERS;
}
/**
 * Returns true when every cell of the (2 * CLEARANCE + 1)-wide square centred
 * on (x, y) in layer z is inside the map and holds neither 1 nor 2.
 * Returns false as soon as a neighbour is out of bounds or holds 1 or 2.
 */
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) {
    for (int dy = -CLEARANCE; dy <= CLEARANCE; ++dy) {
        for (int dx = -CLEARANCE; dx <= CLEARANCE; ++dx) {
            // A neighbour outside the map counts as blocked.
            if (!inBounds(x, y, z, dx, dy, 0))
                return false;

            const sint neighbour = *calcOffset(src, x, y, z, dx, dy, 0, pitch);
            if (neighbour == 2 || neighbour == 1)
                return false;
        }
    }
    return true;
}
今后,你可以先用非 pitched(普通线性)分配把代码调通;调通之后,再看改用 pitched 分配是否能带来性能上的好处。
如果 `(x+xoff)` 为负数(或者 `(x+xoff)` 导致索引落到下一行),即使这样写也不会正确。在 pitched 分配中,你不能用这种方式从某一行向后索引到上一行(或向前索引到下一行)。必须先把 `(x+xoff)` 解析到它实际引用的那一行,再求出它在该行内的索引,最后针对该行执行按 pitch 调整的地址计算。