Question

我的OpenCL内核出现问题：

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable

// Complex number that carries both its Cartesian (re, im) and its polar
// (r, phi) representation; the create*/recreate* helpers fill all four fields.
struct complex {
    double im;   // imaginary part
    double re;   // real part
    double r;    // magnitude (polar radius)
    double phi;  // polar angle; fed to cos()/sin(), i.e. radians
};

// Builds a complex number from polar coordinates.
//
//   _r   - magnitude
//   _phi - angle in radians
//
// Returns a struct complex with all four fields populated; the Cartesian
// part is derived as re = r * cos(phi), im = r * sin(phi).
struct complex createComplexFromPolar(double _r, double _phi){
    struct complex t;
    t.r = _r;
    t.phi = _phi;

    // The unit vector (cos, sin) must be scaled by the magnitude; without the
    // factor _r the Cartesian part would always lie on the unit circle and
    // disagree with the polar part.
    t.re = _r * cos(_phi);
    t.im = _r * sin(_phi);

    return t;
}

// Builds a complex number from Cartesian coordinates.
//
//   real - real part
//   imag - imaginary part
//
// Returns a struct complex with all four fields populated; the polar part is
// derived as r = |real + i*imag| and phi = the angle of that vector.
struct complex createComplexFromKarthes(double real, double imag){
    struct complex t;
    t.re = real;
    t.im = imag;

    // atan2 resolves the quadrant from the signs of both components and is
    // well defined for real == 0, where atan(imag / real) would divide by zero.
    t.phi = atan2(imag, real);
    // hypot computes sqrt(real^2 + imag^2) without a general-purpose pow call.
    t.r = hypot(real, imag);

    return t;
}

// Rebuilds a complex number from its Cartesian fields (re, im) only; the
// incoming polar fields are ignored and recomputed.
struct complex recreateComplexFromKarthes(struct complex t){
    const double realPart = t.re;
    const double imagPart = t.im;
    return createComplexFromKarthes(realPart, imagPart);
}

// Rebuilds a complex number from its polar fields (r, phi) only; the
// incoming Cartesian fields are ignored and recomputed.
struct complex recreateComplexFromPolar(struct complex t){
    const double magnitude = t.r;
    const double angle = t.phi;
    return createComplexFromPolar(magnitude, angle);
}

// Returns z + c.
//
// The sum is formed component-wise on the Cartesian fields; the polar fields
// of the result are then recomputed from it.
struct complex addComplex(const struct complex z, const struct complex c){
    struct complex t;
    t.re = c.re + z.re;
    // Imaginary parts add with each other (the real part of z does not
    // belong in this sum).
    t.im = c.im + z.im;
    return recreateComplexFromKarthes(t);
}

// Returns z - c, computed component-wise on the Cartesian fields; the polar
// fields of the result are derived from the difference.
struct complex subComplex(const struct complex z, const struct complex c){
    const double realDiff = z.re - c.re;
    const double imagDiff = z.im - c.im;
    return createComplexFromKarthes(realDiff, imagDiff);
}

// Returns z + n for a real scalar n.
//
// Only the real part changes; the imaginary part is carried over from z.
// The polar fields of the result are recomputed.
struct complex addComplexScalar(const struct complex z, const double n){
    struct complex t;
    t.re = z.re + n;
    // Must be set explicitly: leaving it unset would feed an uninitialised
    // value into the polar recomputation below.
    t.im = z.im;
    return recreateComplexFromKarthes(t);
}

// Returns z - n for a real scalar n.
//
// Only the real part changes; the imaginary part is carried over from z.
// The polar fields of the result are recomputed.
struct complex subComplexScalar(const struct complex z, const double n){
    struct complex t;
    t.re = z.re - n;
    // Must be set explicitly: leaving it unset would feed an uninitialised
    // value into the polar recomputation below.
    t.im = z.im;
    return recreateComplexFromKarthes(t);
}

// Returns z * n for a real scalar n: both Cartesian components are scaled
// and the polar fields of the result are derived from the scaled values.
struct complex multComplexScalar(const struct complex z, const double n) {
    const double scaledRe = z.re * n;
    const double scaledIm = z.im * n;
    return createComplexFromKarthes(scaledRe, scaledIm);
}

// Returns z * c using the Cartesian product rule
//   (a + bi)(x + yi) = (ax - by) + (ay + bx)i.
// The polar fields of the result are recomputed.
struct complex multComplex(const struct complex z, const struct complex c) {
    struct complex t;
    // Real part subtracts the product of the two imaginary parts.
    t.re = z.re*c.re - z.im*c.im;
    t.im = z.re*c.im + z.im*c.re;
    return recreateComplexFromKarthes(t);
}

// Returns z / c, computed in polar form: magnitudes are divided and angles
// subtracted, then the result is built from that polar pair.
struct complex divComplex(const struct complex z, const struct complex c) {
    const double magnitude = z.r / c.r;
    const double angle = z.phi - c.phi;
    return createComplexFromPolar(magnitude, angle);
}

// Computes, for one pixel, how many iterations of
//   z <- z - (c*z*z + c*z + c) / ((c*z)*2 + c)
// run before |z| reaches 500 or the cap of 10000 iterations is hit.
//
//   res    - two ints: res[0] = image width, res[1] = image height
//   param  - two doubles: real and imaginary part of c
//   result - one int per pixel, indexed as x + width * y
//
// The work item's global ids (0, 1) are used as the pixel coordinates x, y.
__kernel void newtonFraktal(__global const int* res, __global const double* param, __global int* result){
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int xRes = res[0];
    const int yRes = res[1];

    // Work items outside the image (e.g. when the host pads the global range)
    // must not write past the end of result.
    if (x >= xRes || y >= yRes)
        return;

    // Start value: pixel position relative to the image centre.
    struct complex z = createComplexFromKarthes(x - (xRes / 2), y - (yRes / 2));

    const struct complex c = createComplexFromKarthes(param[0], param[1]);

    // NOTE(review): up to 10000 iterations per work item can make a launch
    // run for seconds; on a GPU that also drives a display this may trip the
    // OS driver watchdog - confirm on the target system.
    int i = 0;
    while (i < 10000 && z.r < 500){
        const struct complex zc = multComplex(z, c);                       // c*z
        const struct complex zzc = multComplex(multComplex(z, z), c);      // c*z*z
        const struct complex numerator = addComplex(addComplex(zzc, zc), c);
        const struct complex denominator = addComplex(multComplexScalar(zc, 2), c);

        //z-(c*z*z + c*z + c) / ((c*z) * 2 + c);
        z = subComplex(z, divComplex(numerator, denominator));

        i++;
    }
    result[x + xRes*y] = i;
}

该内核的目的是为牛顿分形创建图像数据。问题是当param[0]和param[1]相差超过3（这会导致clFinish(-36)）或者它们变得太大时会导致崩溃（这会导致clEnqueueReadBuffer(-36)）。

当我将此代码作为C ++代码运行时，每个参数都完全没问题 - 我知道运行时环境之间存在很大差异。

我在nVidia GeForce GTX 770上运行内核，主机在AMD FX-8350上运行。

我希望你们能给我一些提示，告诉我如何解决这个问题。另外，有没有适用于 nVidia 的、可以单步执行的 OpenCL 调试器？

提前感谢您的帮助，
- fodinabor

编辑：好吧——看来我之前关于内核何时崩溃的判断并不完全正确——那只是我昨天观察到的现象……但它似乎与参数值完全无关，因为同样的参数有时能正常运行，有时又不行。例如，param[0] = 3; 和 param[1] = 1; 曾多次运行正常——但现在我却无法运行它……所以我猜这与我的主机端代码有关……我把它贴在下面。这个问题会不会是全局工作尺寸（global work size）太大导致的？我的显卡应该支持 1024*1024*64，而我只需要 640*480……所以按理说它应该能运行吧？我还尝试过每次只用一半的尺寸、分两轮运行，结果总是在第二轮崩溃——希望这个信息有帮助……

/**
 * Loads the kernel source from "newton.cl", builds it for the GPU devices of
 * the first OpenCL platform, runs the "newtonFraktal" kernel over an
 * xRes x yRes range and reads the per-pixel values back into `result`.
 *
 * @param param  Two cl_double values that are uploaded to the kernel's
 *               `param` buffer.
 *
 * OpenCL failures (cl::Error) are caught, logged to std::cerr and their error
 * code is stored in `err`. If the source file cannot be opened or read into
 * memory the constructor returns early without touching OpenCL.
 *
 * `result` is allocated here with calloc and is not released in this
 * function.
 * NOTE(review): presumably the class frees `result` elsewhere - confirm.
 */
NewtonFraktalCLGeneration::NewtonFraktalCLGeneration(cl_double* param){
    FILE* f;
    if (fopen_s(&f, "newton.cl", "r") != 0){
        // NOTE(review): `err` and `result` are not assigned on this path;
        // confirm the class gives them defined initial values.
        return;
    }

    // Read the whole file in fixed-size chunks, growing the buffer so that
    // there is always room for one more chunk (and the final terminator).
    const size_t chunk = 100;
    char* buf = (char*)malloc(chunk * sizeof(char));
    if (buf == NULL){
        fclose(f);
        return;
    }
    size_t total_recv = 0;
    size_t recv_size = 0;
    while ((recv_size = fread_s(buf + total_recv, sizeof(char) * chunk, sizeof(char), chunk, f)) > 0){
        total_recv += recv_size;
        char* grown = (char*)realloc(buf, total_recv + chunk * sizeof(char));
        if (grown == NULL){
            // realloc leaves the old block valid on failure; release it.
            free(buf);
            fclose(f);
            return;
        }
        buf = grown;
    }
    fclose(f);
    buf[total_recv] = '\0';

    err = CL_SUCCESS;
    try {
        cl::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);

        cl_context_properties properties[] =
            { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0 };
        cl::Context context(CL_DEVICE_TYPE_GPU, properties);

        cl::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        cl::Program::Sources source(1, std::make_pair(buf,strlen(buf)));
        cl::Program program_ = cl::Program(context, source);
        program_.build(devices);

        cl::Kernel kernel(program_, "newtonFraktal", &err);

        // Image dimensions live on the stack: nothing to release, even when
        // an exception unwinds this scope.
        int res[2];
        res[0] = Services()->getCore()->getXRes();
        res[1] = Services()->getCore()->getYRes();
        cl::Buffer resBuf(context, CL_MEM_READ_ONLY, 2 * sizeof(int));
        cl::Buffer paramBuf(context, CL_MEM_READ_ONLY, 2 * sizeof(cl_double));

        // One cl_int per pixel; the host allocation and the device buffer
        // use exactly the same byte size.
        const size_t pixelCount = (size_t)res[0] * (size_t)res[1];
        const size_t resultBytes = pixelCount * sizeof(cl_int);
        result = (cl_int*)calloc(pixelCount, sizeof(cl_int));
        cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, resultBytes, result);

        cl::CommandQueue queue(context, devices[0], 0, &err);
        cl::Event event;

        queue.enqueueWriteBuffer(resBuf, CL_TRUE, 0, 2 * sizeof(int), res);
        queue.enqueueWriteBuffer(paramBuf, CL_TRUE, 0, 2 * sizeof(double), param);

        kernel.setArg(0, resBuf);
        kernel.setArg(1, paramBuf);
        kernel.setArg(2, outBuf);

        // One work item per pixel; the work-group size is left to the runtime.
        queue.enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            cl::NDRange(res[0], res[1]),
            cl::NullRange,
            NULL,
            &event);

        queue.finish();

        queue.enqueueReadBuffer(outBuf, CL_TRUE, 0, resultBytes, result);
    }
    catch (cl::Error& e) {
        std::cerr
             << "ERROR: "
             << e.what()
             << "("
             << e.err()
             << ")"
             << std::endl;
        this->err = e.err();
    }

    // The program object that referenced the source text is gone once the
    // try block has ended, so the text can be released on both the success
    // and the error path.
    free(buf);
}

Answer 1

好的 - 所以现在我真的遇到了问题（我希望：D）：
我的内核是在连接着显示器的那块显卡上执行的……而且执行时间超过了 5 秒。-> TDR（Windows 驱动程序的一种安全机制）重置了驱动程序 -> 内核的执行被强行终止。所以现在我把注册表中的 TdrDelay 设置成了更大的值，将来可能会改用另一块显卡来做计算 :D

这对我来说有点烦人..但最后我解决了..所以感谢所有这些帮助我解决其他问题的人。

如果以后有人再次遇到这个问题，下面说明如何增大 TdrDelay（不应该用于生产系统 ;-)）：
打开注册表编辑器（例如，按 Win 键 + R，在弹出的窗口中输入 regedit 并执行），浏览到 HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers，并创建一个名为 TdrDelay 的新条目，其值由您自行选择（它间接决定了内核的最大执行时间），单位为秒（我将其设置为 10 秒）。然后重新启动系统，就可以享受不再被中断的 OpenCL / CUDA 内核了 :D

OpenCL Kernel使用特定参数崩溃

1 个答案: