我有这个cuda文件:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
// Map threadIdx/blockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = x - blockDim.x * gridDim.x / 2;
float oy = y - blockDim.y * gridDim.y / 2;
float r = 0.2, g = 0.2, b = 0.5;
float maxz = -INF;
for (int i = 0; i < SPHERES_COUNT; i++) {
float n, t = s[i].hit(ox, oy, &n);
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
//Capture start time
cudaEvent_t start, stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
//Create host bitmap
CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
image.permute_axes("cxyz");
//Allocate device bitmap data
unsigned char* dev_bitmap;
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));
//Generate spheres and copy them on the GPU one by one
Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
for (int i=0; i <SPHERES_COUNT; i++) {
temp_s[i].r = rnd(1.0f);
temp_s[i].g = rnd(1.0f);
temp_s[i].b = rnd(1.0f);
temp_s[i].x = rnd(1000.0f) - 500;
temp_s[i].y = rnd(1000.0f) - 500;
temp_s[i].z = rnd(1000.0f) - 500;
temp_s[i].radius = rnd(100.0f) + 20;
}
HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
free(temp_s);
//Generate a bitmap from spere data
dim3 grids(WIDTH/16, HEIGHT/16);
dim3 threads(16, 16);
kernel<<<grids, threads>>>(dev_bitmap, s);
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
cudaFree(dev_bitmap);
image.permute_axes("yzcx");
image.save("render.bmp");
}
它编译得很好,但是在执行时我得到了这个错误:
an illegal memory access was encountered in main.cu at line 82
就是这里:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
我无法理解为什么...... 我知道如果删除这个:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
没有报告错误,所以我认为这可能是一个索引错误,稍后报道,但我有一个相同版本的程序,不使用常量内存,并且它可以正常使用相同的版本内核函数...
答案 0 :(得分:3)
这里有两个问题。首先是:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
kernel<<<grids, threads>>>(dev_bitmap, s);
......
在主机代码中,s
是一个主机内存变量,它为CUDA运行时提供了一个句柄,用于连接设备常量内存符号。它不包含有效的设备指针,不能传递给内核调用。结果是无效的内存访问错误。
你可以这样做:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
kernel<<<grids, threads>>>(dev_bitmap, d_s);
......
会导致符号查找以获取s
的设备地址,将其传递给内核是有效的。但是,GPU依靠编译器发出特定指令来通过常量缓存访问内存。设备编译器只有在检测到内核中正在访问__constant__
变量时才会发出这些指令,这在使用指针时是不可能的。您可以在this Stack Overflow question and answer中看到有关编译器如何为常量变量访问生成代码的更多信息。