Question

我有这个cuda文件：

#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>

#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5

using namespace cimg_library;

__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
   // Map threadIdx/blockIdx to pixel position
   int x = threadIdx.x + blockIdx.x * blockDim.x;
   int y = threadIdx.y + blockIdx.y * blockDim.y;
   int offset = x + y * blockDim.x * gridDim.x;
   float ox = x - blockDim.x * gridDim.x / 2;
   float oy = y - blockDim.y * gridDim.y / 2;
   float r = 0.2, g = 0.2, b = 0.5;
   float maxz = -INF;
   for (int i = 0; i < SPHERES_COUNT; i++) {
       float n, t = s[i].hit(ox, oy, &n);
       if (t > maxz) {
           float fscale = n;
           r = s[i].r * fscale;
           g = s[i].g * fscale;
           b = s[i].b * fscale;
           maxz = t;
       }
   }

   bitmap[offset*3] = (int)(r * 255);
   bitmap[offset*3 + 1] = (int)(g * 255);
   bitmap[offset*3 + 2] = (int)(b * 255);
}

__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    //Capture start time
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    //Create host bitmap
    CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
    image.permute_axes("cxyz");

    //Allocate device bitmap data
    unsigned char* dev_bitmap;
    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));

    //Generate spheres and copy them on the GPU one by one
    Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
    for (int i=0; i <SPHERES_COUNT; i++) {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }

    HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
    free(temp_s);

    //Generate a bitmap from spere data
    dim3 grids(WIDTH/16, HEIGHT/16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(dev_bitmap, s);

    //Copy the bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
                            image.size()*sizeof(unsigned char),
                            cudaMemcpyDeviceToHost));

    cudaFree(dev_bitmap);

    image.permute_axes("yzcx");
    image.save("render.bmp");
}

它编译得很好，但是在执行时我得到了这个错误：

an illegal memory access was encountered in main.cu at line 82

就是这里：

    //Copy the bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
                            image.size()*sizeof(unsigned char),
                            cudaMemcpyDeviceToHost));

我无法理解为什么...... 我知道如果删除这个：

  bitmap[offset*3] = (int)(r * 255);
  bitmap[offset*3 + 1] = (int)(g * 255);
  bitmap[offset*3 + 2] = (int)(b * 255);

没有报告错误，所以我认为这可能是一个索引错误，稍后报道，但我有一个相同版本的程序，不使用常量内存，并且它可以正常使用相同的版本内核函数...

Answer 1

这里有两个问题。首先是：

__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    ......

    kernel<<<grids, threads>>>(dev_bitmap, s);

    ......

在主机代码中，s是一个主机内存变量，它为CUDA运行时提供了一个句柄，用于连接设备常量内存符号。它不包含有效的设备指针，不能传递给内核调用。结果是无效的内存访问错误。

你可以这样做：

__constant__ Sphere s[SPHERES_COUNT];

int main ()
{
    ......

    Sphere *d_s;
    cudaGetSymbolAddress((void **)&d_s, s);
    kernel<<<grids, threads>>>(dev_bitmap, d_s);

    ......

会导致符号查找以获取s的设备地址，将其传递给内核是有效的。但是，GPU依靠编译器发出特定指令来通过常量缓存访问内存。设备编译器只有在检测到内核中正在访问__constant__变量时才会发出这些指令，这在使用指针时是不可能的。您可以在this Stack Overflow question and answer中看到有关编译器如何为常量变量访问生成代码的更多信息。

CUDA，“Memcpy遇到非法内存访问”

1 个答案: