访问CUDA内核中的变量时的CUDA_ERROR_ILLEGAL_ADDRESS

时间:2016-05-27 17:16:18

标签: java cuda jcuda

我在尝试运行用于计算Buddhabrot分形轨道的内核时遇到CUDA_ERROR_ILLEGAL_ADDRESS异常。

extern "C"

__global__ void exec(int iterations, int size,
                float* inputR,  float* inputI, // Real/Imaginary input
                int* output                    // Output image in one dimension
                ) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    float cR = inputR[i];
    float cI = inputI[i];

    float x = 0;
    float y = 0;

    float outX[1000];
    float outY[1000];

    for (int j = 0; j < iterations; j++) {
        outX[j] = x;
        outY[j] = y;

        float xNew = (x * x) - (y * y) + cR;
        float yNew = (2 * x * y) + cI;

        if (xNew * xNew + yNew * yNew > 4) {
            for (int k = 1; k < j; k++) {
                int curX = (outX[k] + 2 ) * size / 4;
                int curY = (outY[k] + 2 ) * size / 4;

                int idx = curX + size * curY;

                output[idx]++; // <- exception here
            }
            return;
        }

        x = xNew;
        y = yNew;
    }
}

我现在尝试了很多东西,而且这个错误似乎与我最初想到的数据相反。例如,

output[0] = 0;

工作得很好。但是,当我尝试调试idx时(记住我首先认为错误与数组有关),我发现我既不能像这样分配idx

output[0] = idx;

也不在printf语句中使用它

if (i == 0) {
    printf("%d\n", idx);
}

我已尝试使用curXcurY同样拒绝工作,但cR例如可以正常工作而不会出现任何错误。在最里面的循环中分配的变量似乎有问题(我也不能分配k),所以我尝试在函数开头的所有循环之外声明idx,但是徒劳无功。仍然是同样的错误。

堆栈追踪:

Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
        at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
        at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
        at fractal.Buddhabrot.<init>(Buddhabrot.java:96)
        at controller.Controller.<init>(Controller.java:10)
        at Main.main(Main.java:8)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:497)
        at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)

常量:

block size            512*1*1
grid size             64 *1*1
iterations            1000
size                  256
inputR, inputI length 64*512
output length         256*256

MCVE:

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.*;

import java.io.File;
import java.util.Random;

import static jcuda.driver.JCudaDriver.*;

public class Stackoverflow {
    public static final int SIZE = 256;
    public static final long NUM_POINTS = 128 * 128 * 128;
    public static final int ITERATIONS = 10000;

    public static final int BLOCK_SIZE = 512;
    public static final int SIM_THREADS = BLOCK_SIZE * 64;

    public static final Random random = new Random();

    public static void main(String[] args) {
        File ptxFile = new File("Buddha.ptx");

        setExceptionsEnabled(true);
        cuInit(0);

        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);

        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFile.getAbsolutePath());

        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "exec");

        cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);

        float[] inR = new float[SIM_THREADS];
        float[] inI = new float[SIM_THREADS];

        int[] out = new int[SIZE * SIZE];

        CUdeviceptr deviceInputR = new CUdeviceptr();
        cuMemAlloc(deviceInputR, inR.length * Sizeof.FLOAT);
        CUdeviceptr deviceInputI = new CUdeviceptr();
        cuMemAlloc(deviceInputI, inI.length * Sizeof.FLOAT);

        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, out.length * Sizeof.INT);

        for (long i = 0; i < NUM_POINTS; i += SIM_THREADS) {
            for (int j = 0; j < SIM_THREADS; j++) {
                inR[j] = random.nextFloat() * 4f - 2f;
                inI[j] = random.nextFloat() * 4f - 2f;
            }

            System.out.println("GPU START");

            cuMemcpyHtoD(deviceInputR, Pointer.to(inR), inR.length * Sizeof.FLOAT);
            cuMemcpyHtoD(deviceInputI, Pointer.to(inI), inI.length * Sizeof.FLOAT);

            Pointer kernelParameters = Pointer.to(
                    Pointer.to(new int[]{ITERATIONS}),
                    Pointer.to(new int[]{SIZE}),
                    Pointer.to(deviceInputR),
                    Pointer.to(deviceInputI),
                    Pointer.to(deviceOutput)
            );

            int gridSize = (int) Math.ceil(((double) SIM_THREADS) / BLOCK_SIZE);

            cuLaunchKernel(function,
                    gridSize, 1, 1,
                    BLOCK_SIZE, 1, 1,
                    0, null,
                    kernelParameters, null
            );

            cuCtxSynchronize();

            System.out.println("GPU END");
        }

        cuMemcpyDtoH(Pointer.to(out), deviceOutput, out.length * Sizeof.INT);
    }
}

1 个答案:

答案 0 :(得分:3)

在你的&#34;常数&#34;您已表明过的部分:

iterations            1000

但是在你的java代码中(在你提供MCVE之后)你有这个:

public static final int ITERATIONS = 10000;

这显然会导致内核代码的这一部分中断:

float outX[1000];
float outY[1000];

for (int j = 0; j < iterations; j++) {
    outX[j] = x;
    outY[j] = y;

因为iterations的10000正在索引越界。 (此循环的范围实际上取决于数据,但对于某些数据输入模式,循环将遍历1000,如所写的那样。)

当我改变这个时:

public static final int ITERATIONS = 10000;

到此:

public static final int ITERATIONS = 1000;

您的代码正确运行:

$ cuda-memcheck java -cp ".:jcuda-0.7.5b.jar" so1
========= CUDA-MEMCHECK
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
GPU START
GPU END
========= ERROR SUMMARY: 0 errors
$