OpenGL中的错误?相同的浮点计算会导致不同的结果

时间:2017-08-20 09:44:37

标签: opengl glsl

"最小化的例子"很遗憾很长一段时间,因为如果我丢掉了太多不相关的代码,那么这个bug就会消失。

这是C ++代码:

#define _USE_MATH_DEFINES

#include <iostream>
#include <iomanip>
#include <vector>
#include <math.h>
#include <time.h>
#include <fstream>
#include <sstream>

//OpenGL includes
#include <GL/glew.h>                    //OpenGL Extension Wrangler
#include <GL/freeglut.h>                //Free OpenGL Utility Toolkit (includes gl.h and glu.h)

void fPressEnter(){
    std::cout << "Press enter to exit.";
    std::cin.get();
}

void fCompileShaderFromString(const std::string& Source, const GLuint ShaderId){
    const char* Code = Source.c_str();

    glShaderSource(ShaderId, 1, &Code, NULL);

    glCompileShader(ShaderId);

    int Status;
    glGetShaderiv(ShaderId, GL_COMPILE_STATUS, &Status);
    if(Status == GL_FALSE){
        int Length = 0;
        glGetShaderiv(ShaderId, GL_INFO_LOG_LENGTH, &Length);
        if(Length > 0){
            char* Log = new char[Length];
            int Written = 0;
            glGetShaderInfoLog(ShaderId, Length, &Written, Log);
            std::cout << Log << std::endl;
            delete[] Log;
        }
    }
}

void fCompileShaderFromFile(const char* FileName, const GLuint ShaderId){
    std::ifstream InFile(FileName, std::ios::in);
    std::ostringstream Code;

    while(InFile.good()){
        int TI = InFile.get();
        if(!InFile.eof()) Code << (char) TI;
    }
    InFile.close();

    fCompileShaderFromString(Code.str(), ShaderId);
}

void fLinkProgram(const GLuint ProgramId){
    glLinkProgram(ProgramId);

    int Status;
    glGetProgramiv(ProgramId, GL_LINK_STATUS, &Status);
    if(Status == GL_FALSE){
        int Length = 0;
        glGetProgramiv(ProgramId, GL_INFO_LOG_LENGTH, &Length);
        if(Length > 0){
            char* Log = new char[Length];
            int Written = 0;
            glGetProgramInfoLog(ProgramId, Length, &Written, Log);
            std::cout << Log << std::endl;
            delete[] Log;
        }
    }
}

int main(){
    //OpenGL setup

    //Glut
    int argc = 1;
    char* argv[1] = {(char*) ""};
    glutInit(&argc, argv);

    glutInitWindowPosition((glutGet(GLUT_SCREEN_WIDTH) - 256) / 2, (glutGet(GLUT_SCREEN_HEIGHT) - 256) / 2);
    glutInitWindowSize(256, 256);
    glutInitDisplayMode(GLUT_RGB);

    glutCreateWindow("Bug");
    glutHideWindow();

    //Glew
    GLenum glew_ok = glewInit();
    if(glew_ok != GLEW_OK){fprintf(stderr, "Glew error: '%s'\n", glewGetErrorString(glew_ok)); fPressEnter(); return(1);}

    //Main program

    //Auxiliary variables
    GLuint ShaderId, ProgramId, BufferId;
    float* BufferPointer;
    float* Data = new float[32 * 32 * 2];
    float dsup, esup;

    //Compile shader program and create buffer in graphics card memory
    ShaderId = glCreateShader(GL_COMPUTE_SHADER);
    fCompileShaderFromFile("Shader.txt", ShaderId);

    ProgramId = glCreateProgram();
    glAttachShader(ProgramId, ShaderId);
    fLinkProgram(ProgramId);
    glDetachShader(ProgramId, ShaderId);

    glDeleteShader(ShaderId);

    glUseProgram(ProgramId);

    glGenBuffers(1, &BufferId);

    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, BufferId);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 32 * 32 * 2 * sizeof(float), NULL, GL_STREAM_READ);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);

    //Actual computation
    std::cout << "Starting computation" << std::endl << std::endl << std::flush;

    dsup = 0.f;
    esup = 0.f;

    glUniform1i(0, 0);
    glUniform1i(1, 0);

    glDispatchCompute(4, 4, 1);

    glMemoryBarrier(GL_ALL_BARRIER_BITS);
    glFinish();

    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, BufferId);
    BufferPointer = (float*) glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
    memcpy(Data, BufferPointer, 32 * 32 * 2 * sizeof(float));
    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);

    for(int z2 = 0; z2 < 32; z2++) for(int z3 = 0; z3 < 32; z3++){
        if(Data[z2 * 32 * 2 + z3 * 2 + 0] > dsup) dsup = Data[z2 * 32 * 2 + z3 * 2 + 0];
        if(Data[z2 * 32 * 2 + z3 * 2 + 1] > esup) esup = Data[z2 * 32 * 2 + z3 * 2 + 1];
    }

    std::cout << std::setprecision(8) << dsup << ", " << std::setprecision(8) << esup << std::endl;

    //Cleanup
    delete[] Data;

    glDeleteBuffers(1, &BufferId);

    glDeleteProgram(ProgramId);

    fPressEnter();
    return(0);
}

以下是着色器:

#version 430 core

const uint mg = 2048;
const uint mb = 512;
const uint my = 128;

layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

layout(binding = 0) buffer OutputBlock{
    float Data[32][32][2];
} Output;

layout(location = 0) uniform int goffset;
layout(location = 1) uniform int boffset;

vec2 fe(float x){return(vec2(cos(6.2831853e0f * x), sin(6.2831853e0f * x)));}
vec2 fComplexTimes(const vec2 A, const vec2 B){return(vec2(A.x * B.x - A.y * B.y, A.x * B.y + A.y * B.x));}
vec2 fComplexTimesI(const vec2 A){return(vec2(-A.y, A.x));}
vec2 fComplexTimesMinusI(const vec2 A){return(vec2(A.y, -A.x));}

float dsup, esup;

void fDESupTaylor_a10_w3_d2(){
    const vec2 one = vec2(1.f, 0.f);
    uint q, zg, zb, zy, n;
    float fq, fb, fg, fys, fymb, fmg, fmb, fatmy;
    float dsum, esum, dsummandsup, esummandsup, dabs, eabs;
    float t;
    vec2 td, te;
    vec2 Di_0_, Di_0_y, Di_0_yy, Di_1_, Di_1_g, Di_1_b, Di_1_y, Di_1_gg, Di_1_gb, Di_1_gy, Di_1_bg, Di_1_bb, Di_1_by, Di_1_yg, Di_1_yb, Di_1_yy, Di_2_, Di_2_g, Di_2_b, Di_2_y, Di_2_gg, Di_2_gb, Di_2_gy, Di_2_bg, Di_2_bb, Di_2_by, Di_2_yg, Di_2_yb, Di_2_yy;
    vec2 Ei_0_, Ei_0_y, Ei_0_yy, Ei_1_, Ei_1_g, Ei_1_b, Ei_1_y, Ei_1_gg, Ei_1_gb, Ei_1_gy, Ei_1_bg, Ei_1_bb, Ei_1_by, Ei_1_yg, Ei_1_yb, Ei_1_yy, Ei_2_, Ei_2_g, Ei_2_b, Ei_2_y, Ei_2_gg, Ei_2_gb, Ei_2_gy, Ei_2_bg, Ei_2_bb, Ei_2_by, Ei_2_yg, Ei_2_yb, Ei_2_yy;
    vec2 D_2_, D_2_g, D_2_b, D_2_y, D_2_gg, D_2_gb, D_2_gy, D_2_bg, D_2_bb, D_2_by, D_2_yg, D_2_yb, D_2_yy, D_3_, D_3_g, D_3_b, D_3_y, D_3_gg, D_3_gb, D_3_gy, D_3_bg, D_3_bb, D_3_by, D_3_yg, D_3_yb, D_3_yy;
    vec2 E_2_, E_2_g, E_2_b, E_2_y, E_2_gg, E_2_gb, E_2_gy, E_2_bg, E_2_bb, E_2_by, E_2_yg, E_2_yb, E_2_yy, E_3_, E_3_g, E_3_b, E_3_y, E_3_gg, E_3_gb, E_3_gy, E_3_bg, E_3_bb, E_3_by, E_3_yg, E_3_yb, E_3_yy;
    float R_g, R_b, R_y, R_gg, R_gb, R_gy, R_bg, R_bb, R_by, R_yg, R_yb, R_yy;

    fmg = float(mg);
    fmb = float(mb);
    fatmy = 10.f * float(my);

    R_g = 2.f * fmg;
    R_b = 2.f * fmb;
    R_y = 2.f * fatmy;
    R_gg = 8.f * pow(fmg, 2.f);
    R_gb = 8.f * fmg * fmb;
    R_gy = 8.f * fmg * fatmy;
    R_bg = 8.f * fmg * fmb;
    R_bb = 8.f * pow(fmb, 2.f);
    R_by = 8.f * fmb * fatmy;
    R_yg = 8.f * fmg * fatmy;
    R_yb = 8.f * fmb * fatmy;
    R_yy = 8.f * pow(fatmy, 2.f);

    dsup = 0.f;
    esup = 0.f;

    zg = goffset + gl_GlobalInvocationID.x;
    fg = float(zg) / fmg;

    zb = boffset + gl_GlobalInvocationID.y;
    fb = float(zb) / fmb;

    for(q = 0; q < 10; q++){
        fq = float(q);

        dsum = 0.f;
        esum = 0.f;

        for(n = 0; n <= 1031; n++){
            dsummandsup = 0.f;
            esummandsup = 0.f;

            fys = float(n) / 10.f;

            for(zy = 0; zy <= my; zy++){
                fymb = fys + float(zy) / fatmy - fb;

                t = fb + fymb;

                Di_0_ = one + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t);
                Ei_0_ = fe(10.f * t);

                td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fComplexTimesI(td);
                te = fe(10.f * t); te = fComplexTimesI(te);
                Di_0_y = 6.2831853e0f * td; Ei_0_y = 6.2831853e1f * te;

                td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td;
                te = fe(10.f * t); te = -te;
                Di_0_yy = 3.9478418e1f * td; Ei_0_yy = 3.9478418e3f * te;

                t = fg + fb + 9.9019514e-2f * fymb;

                Di_1_ = one + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t);
                Ei_1_ = fe(10.f * t);

                td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fComplexTimesI(td);
                te = fe(10.f * t); te = fComplexTimesI(te);
                Di_1_g = 6.2831853e0f * td; Ei_1_g = 6.2831853e1f * te;
                Di_1_b = 5.6610274e0f * td; Ei_1_b = 5.6610274e1f * te;
                Di_1_y = 6.2215795e-1f * td; Ei_1_y = 6.2215795e0f * te;

                td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td;
                te = fe(10.f * t); te = -te;
                Di_1_gg = 3.9478418e1f * td; Ei_1_gg = 3.9478418e3f * te;
                Di_1_gb = 3.5569284e1f * td; Ei_1_gb = 3.5569284e3f * te;
                Di_1_gy = 3.9091337e0f * td; Ei_1_gy = 3.9091337e2f * te;
                Di_1_bg = 3.5569284e1f * td; Ei_1_bg = 3.5569284e3f * te;
                Di_1_bb = 3.2047231e1f * td; Ei_1_bb = 3.2047231e3f * te;
                Di_1_by = 3.5220532e0f * td; Ei_1_by = 3.5220532e2f * te;
                Di_1_yg = 3.9091337e0f * td; Ei_1_yg = 3.9091337e2f * te;
                Di_1_yb = 3.5220532e0f * td; Ei_1_yb = 3.5220532e2f * te;
                Di_1_yy = 3.8708052e-1f * td; Ei_1_yy = 3.8708052e1f * te;

                t = -10.f * fg + fb + 9.8048641e-3f * fymb;

                Di_2_ = one + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t);
                Ei_2_ = fe(10.f * t);

                td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fComplexTimesI(td);
                te = fe(10.f * t); te = fComplexTimesI(te);
                Di_2_g = -6.2831853e1f * td; Ei_2_g = -6.2831853e2f * te;
                Di_2_b = 6.2215795e0f * td; Ei_2_b = 6.2215795e1f * te;
                Di_2_y = 6.1605778e-2f * td; Ei_2_y = 6.1605778e-1f * te;

                td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td;
                te = fe(10.f * t); te = -te;
                Di_2_gg = 3.9478418e3f * td; Ei_2_gg = 3.9478418e5f * te;
                Di_2_gb = -3.9091337e2f * td; Ei_2_gb = -3.9091337e4f * te;
                Di_2_gy = -3.8708052e0f * td; Ei_2_gy = -3.8708052e2f * te;
                Di_2_bg = -3.9091337e2f * td; Ei_2_bg = -3.9091337e4f * te;
                Di_2_bb = 3.8708052e1f * td; Ei_2_bb = 3.8708052e3f * te;
                Di_2_by = 3.8328525e-1f * td; Ei_2_by = 3.8328525e1f * te;
                Di_2_yg = -3.8708052e0f * td; Ei_2_yg = -3.8708052e2f * te;
                Di_2_yb = 3.8328525e-1f * td; Ei_2_yb = 3.8328525e1f * te;
                Di_2_yy = 3.7952719e-3f * td; Ei_2_yy = 3.7952719e-1f * te;

                D_2_ = fComplexTimes(Di_0_, Di_1_) + Ei_0_;
                D_2_g = fComplexTimes(Di_0_, Di_1_g);
                D_2_b = fComplexTimes(Di_0_, Di_1_b);
                D_2_y = fComplexTimes(Di_0_y, Di_1_) + fComplexTimes(Di_0_, Di_1_y) + Ei_0_y;
                D_2_gg = fComplexTimes(Di_0_, Di_1_gg);
                D_2_gb = fComplexTimes(Di_0_, Di_1_gb);
                D_2_gy = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_gy);
                D_2_bg = fComplexTimes(Di_0_, Di_1_bg);
                D_2_bb = fComplexTimes(Di_0_, Di_1_bb);
                D_2_by = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_by);
                D_2_yg = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_yg);
                D_2_yb = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_yb);
                D_2_yy = fComplexTimes(Di_0_yy, Di_1_) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_, Di_1_yy) + Ei_0_yy;

                E_2_ = fComplexTimes(Di_0_, Ei_1_);
                E_2_g = fComplexTimes(Di_0_, Ei_1_g);
                E_2_b = fComplexTimes(Di_0_, Ei_1_b);
                E_2_y = fComplexTimes(Di_0_y, Ei_1_) + fComplexTimes(Di_0_, Ei_1_y);
                E_2_gg = fComplexTimes(Di_0_, Ei_1_gg);
                E_2_gb = fComplexTimes(Di_0_, Ei_1_gb);
                E_2_gy = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_gy);
                E_2_bg = fComplexTimes(Di_0_, Ei_1_bg);
                E_2_bb = fComplexTimes(Di_0_, Ei_1_bb);
                E_2_by = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_by);
                E_2_yg = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_yg);
                E_2_yb = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_yb);
                E_2_yy = fComplexTimes(Di_0_yy, Ei_1_) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_, Ei_1_yy);

                D_3_ = fComplexTimes(D_2_, Di_2_) + E_2_;
                D_3_g = fComplexTimes(D_2_g, Di_2_) + fComplexTimes(D_2_, Di_2_g) + E_2_g;
                D_3_b = fComplexTimes(D_2_b, Di_2_) + fComplexTimes(D_2_, Di_2_b) + E_2_b;
                D_3_y = fComplexTimes(D_2_y, Di_2_) + fComplexTimes(D_2_, Di_2_y) + E_2_y;
                D_3_gg = fComplexTimes(D_2_gg, Di_2_) + fComplexTimes(D_2_g, Di_2_g) + fComplexTimes(D_2_g, Di_2_g) + fComplexTimes(D_2_, Di_2_gg) + E_2_gg;
                D_3_gb = fComplexTimes(D_2_gb, Di_2_) + fComplexTimes(D_2_g, Di_2_b) + fComplexTimes(D_2_b, Di_2_g) + fComplexTimes(D_2_, Di_2_gb) + E_2_gb;
                D_3_gy = fComplexTimes(D_2_gy, Di_2_) + fComplexTimes(D_2_g, Di_2_y) + fComplexTimes(D_2_y, Di_2_g) + fComplexTimes(D_2_, Di_2_gy) + E_2_gy;
                D_3_bg = fComplexTimes(D_2_bg, Di_2_) + fComplexTimes(D_2_b, Di_2_g) + fComplexTimes(D_2_g, Di_2_b) + fComplexTimes(D_2_, Di_2_bg) + E_2_bg;
                D_3_bb = fComplexTimes(D_2_bb, Di_2_) + fComplexTimes(D_2_b, Di_2_b) + fComplexTimes(D_2_b, Di_2_b) + fComplexTimes(D_2_, Di_2_bb) + E_2_bb;
                D_3_by = fComplexTimes(D_2_by, Di_2_) + fComplexTimes(D_2_b, Di_2_y) + fComplexTimes(D_2_y, Di_2_b) + fComplexTimes(D_2_, Di_2_by) + E_2_by;
                //The bug occurs here: Use one of the two following semantically identical lines to get different results
                D_3_yg = fComplexTimes(D_2_gy, Di_2_) + fComplexTimes(D_2_g, Di_2_y) + fComplexTimes(D_2_y, Di_2_g) + fComplexTimes(D_2_, Di_2_gy) + E_2_gy;
                //D_3_yg = D_3_gy;
                D_3_yb = fComplexTimes(D_2_yb, Di_2_) + fComplexTimes(D_2_y, Di_2_b) + fComplexTimes(D_2_b, Di_2_y) + fComplexTimes(D_2_, Di_2_yb) + E_2_yb;
                D_3_yy = fComplexTimes(D_2_yy, Di_2_) + fComplexTimes(D_2_y, Di_2_y) + fComplexTimes(D_2_y, Di_2_y) + fComplexTimes(D_2_, Di_2_yy) + E_2_yy;

                dabs = length(D_3_) + length(D_3_g) / R_g + length(D_3_b) / R_b + length(D_3_y) / R_y + length(D_3_gg) / R_gg + length(D_3_gb) / R_gb + length(D_3_gy) / R_gy + length(D_3_bg) / R_bg + length(D_3_bb) / R_bb + length(D_3_by) / R_by + length(D_3_yg) / R_yg + length(D_3_yb) / R_yb + length(D_3_yy) / R_yy;
                dsummandsup = max(dsummandsup, dabs);

                E_3_ = fComplexTimes(D_2_, Ei_2_);
                E_3_g = fComplexTimes(D_2_g, Ei_2_) + fComplexTimes(D_2_, Ei_2_g);
                E_3_b = fComplexTimes(D_2_b, Ei_2_) + fComplexTimes(D_2_, Ei_2_b);
                E_3_y = fComplexTimes(D_2_y, Ei_2_) + fComplexTimes(D_2_, Ei_2_y);
                E_3_gg = fComplexTimes(D_2_gg, Ei_2_) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gg);
                E_3_gb = fComplexTimes(D_2_gb, Ei_2_) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gb);
                E_3_gy = fComplexTimes(D_2_gy, Ei_2_) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gy);
                E_3_bg = fComplexTimes(D_2_bg, Ei_2_) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bg);
                E_3_bb = fComplexTimes(D_2_bb, Ei_2_) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bb);
                E_3_by = fComplexTimes(D_2_by, Ei_2_) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_, Ei_2_by);
                E_3_yg = fComplexTimes(D_2_yg, Ei_2_) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yg);
                E_3_yb = fComplexTimes(D_2_yb, Ei_2_) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yb);
                E_3_yy = fComplexTimes(D_2_yy, Ei_2_) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yy);

                eabs = length(E_3_) + length(E_3_g) / R_g + length(E_3_b) / R_b + length(E_3_y) / R_y + length(E_3_gg) / R_gg + length(E_3_gb) / R_gb + length(E_3_gy) / R_gy + length(E_3_bg) / R_bg + length(E_3_bb) / R_bb + length(E_3_by) / R_by + length(E_3_yg) / R_yg + length(E_3_yb) / R_yb + length(E_3_yy) / R_yy;
                esummandsup = max(esummandsup, eabs);
            }

            dsum += dsummandsup;
            esum += esummandsup;
        }
        for(n = 1032; n <= 10402; n++){
            dsummandsup = 0.f;
            esummandsup = 0.f;

            fys = float(n) / 10.f;

            for(zy = 0; zy <= my; zy++){
                fymb = fys + float(zy) / fatmy - fb;

                t = fb + fymb;

                Di_0_ = one + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t);
                Ei_0_ = fe(10.f * t);

                td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fComplexTimesI(td);
                te = fe(10.f * t); te = fComplexTimesI(te);
                Di_0_y = 6.2831853e0f * td; Ei_0_y = 6.2831853e1f * te;

                td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td;
                te = fe(10.f * t); te = -te;
                Di_0_yy = 3.9478418e1f * td; Ei_0_yy = 3.9478418e3f * te;

                t = fg + fb + 9.9019514e-2f * fymb;

                Di_1_ = one + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t);
                Ei_1_ = fe(10.f * t);

                td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fComplexTimesI(td);
                te = fe(10.f * t); te = fComplexTimesI(te);
                Di_1_g = 6.2831853e0f * td; Ei_1_g = 6.2831853e1f * te;
                Di_1_b = 5.6610274e0f * td; Ei_1_b = 5.6610274e1f * te;
                Di_1_y = 6.2215795e-1f * td; Ei_1_y = 6.2215795e0f * te;

                td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td;
                te = fe(10.f * t); te = -te;
                Di_1_gg = 3.9478418e1f * td; Ei_1_gg = 3.9478418e3f * te;
                Di_1_gb = 3.5569284e1f * td; Ei_1_gb = 3.5569284e3f * te;
                Di_1_gy = 3.9091337e0f * td; Ei_1_gy = 3.9091337e2f * te;
                Di_1_bg = 3.5569284e1f * td; Ei_1_bg = 3.5569284e3f * te;
                Di_1_bb = 3.2047231e1f * td; Ei_1_bb = 3.2047231e3f * te;
                Di_1_by = 3.5220532e0f * td; Ei_1_by = 3.5220532e2f * te;
                Di_1_yg = 3.9091337e0f * td; Ei_1_yg = 3.9091337e2f * te;
                Di_1_yb = 3.5220532e0f * td; Ei_1_yb = 3.5220532e2f * te;
                Di_1_yy = 3.8708052e-1f * td; Ei_1_yy = 3.8708052e1f * te;

                t = -10.f * fg + fb + 9.8048641e-3f * fymb;

                Di_2_ = one + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t);
                Ei_2_ = fe(10.f * t);

                td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fComplexTimesI(td);
                te = fe(10.f * t); te = fComplexTimesI(te);
                Di_2_g = -6.2831853e1f * td; Ei_2_g = -6.2831853e2f * te;
                Di_2_b = 6.2215795e0f * td; Ei_2_b = 6.2215795e1f * te;
                Di_2_y = 6.1605778e-2f * td; Ei_2_y = 6.1605778e-1f * te;

                td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td;
                te = fe(10.f * t); te = -te;
                Di_2_gg = 3.9478418e3f * td; Ei_2_gg = 3.9478418e5f * te;
                Di_2_gb = -3.9091337e2f * td; Ei_2_gb = -3.9091337e4f * te;
                Di_2_gy = -3.8708052e0f * td; Ei_2_gy = -3.8708052e2f * te;
                Di_2_bg = -3.9091337e2f * td; Ei_2_bg = -3.9091337e4f * te;
                Di_2_bb = 3.8708052e1f * td; Ei_2_bb = 3.8708052e3f * te;
                Di_2_by = 3.8328525e-1f * td; Ei_2_by = 3.8328525e1f * te;
                Di_2_yg = -3.8708052e0f * td; Ei_2_yg = -3.8708052e2f * te;
                Di_2_yb = 3.8328525e-1f * td; Ei_2_yb = 3.8328525e1f * te;
                Di_2_yy = 3.7952719e-3f * td; Ei_2_yy = 3.7952719e-1f * te;

                D_2_ = fComplexTimes(Di_0_, Di_1_) + Ei_0_;
                D_2_g = fComplexTimes(Di_0_, Di_1_g);
                D_2_b = fComplexTimes(Di_0_, Di_1_b);
                D_2_y = fComplexTimes(Di_0_y, Di_1_) + fComplexTimes(Di_0_, Di_1_y) + Ei_0_y;
                D_2_gg = fComplexTimes(Di_0_, Di_1_gg);
                D_2_gb = fComplexTimes(Di_0_, Di_1_gb);
                D_2_gy = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_gy);
                D_2_bg = fComplexTimes(Di_0_, Di_1_bg);
                D_2_bb = fComplexTimes(Di_0_, Di_1_bb);
                D_2_by = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_by);
                D_2_yg = fComplexTimes(Di_0_y, Di_1_g) + fComplexTimes(Di_0_, Di_1_yg);
                D_2_yb = fComplexTimes(Di_0_y, Di_1_b) + fComplexTimes(Di_0_, Di_1_yb);
                D_2_yy = fComplexTimes(Di_0_yy, Di_1_) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_y, Di_1_y) + fComplexTimes(Di_0_, Di_1_yy) + Ei_0_yy;

                E_2_ = fComplexTimes(Di_0_, Ei_1_);
                E_2_g = fComplexTimes(Di_0_, Ei_1_g);
                E_2_b = fComplexTimes(Di_0_, Ei_1_b);
                E_2_y = fComplexTimes(Di_0_y, Ei_1_) + fComplexTimes(Di_0_, Ei_1_y);
                E_2_gg = fComplexTimes(Di_0_, Ei_1_gg);
                E_2_gb = fComplexTimes(Di_0_, Ei_1_gb);
                E_2_gy = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_gy);
                E_2_bg = fComplexTimes(Di_0_, Ei_1_bg);
                E_2_bb = fComplexTimes(Di_0_, Ei_1_bb);
                E_2_by = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_by);
                E_2_yg = fComplexTimes(Di_0_y, Ei_1_g) + fComplexTimes(Di_0_, Ei_1_yg);
                E_2_yb = fComplexTimes(Di_0_y, Ei_1_b) + fComplexTimes(Di_0_, Ei_1_yb);
                E_2_yy = fComplexTimes(Di_0_yy, Ei_1_) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_y, Ei_1_y) + fComplexTimes(Di_0_, Ei_1_yy);

                E_3_ = fComplexTimes(D_2_, Ei_2_);
                E_3_g = fComplexTimes(D_2_g, Ei_2_) + fComplexTimes(D_2_, Ei_2_g);
                E_3_b = fComplexTimes(D_2_b, Ei_2_) + fComplexTimes(D_2_, Ei_2_b);
                E_3_y = fComplexTimes(D_2_y, Ei_2_) + fComplexTimes(D_2_, Ei_2_y);
                E_3_gg = fComplexTimes(D_2_gg, Ei_2_) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gg);
                E_3_gb = fComplexTimes(D_2_gb, Ei_2_) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gb);
                E_3_gy = fComplexTimes(D_2_gy, Ei_2_) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_, Ei_2_gy);
                E_3_bg = fComplexTimes(D_2_bg, Ei_2_) + fComplexTimes(D_2_b, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bg);
                E_3_bb = fComplexTimes(D_2_bb, Ei_2_) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_b) + fComplexTimes(D_2_, Ei_2_bb);
                E_3_by = fComplexTimes(D_2_by, Ei_2_) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_, Ei_2_by);
                E_3_yg = fComplexTimes(D_2_yg, Ei_2_) + fComplexTimes(D_2_y, Ei_2_g) + fComplexTimes(D_2_g, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yg);
                E_3_yb = fComplexTimes(D_2_yb, Ei_2_) + fComplexTimes(D_2_y, Ei_2_b) + fComplexTimes(D_2_b, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yb);
                E_3_yy = fComplexTimes(D_2_yy, Ei_2_) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_y, Ei_2_y) + fComplexTimes(D_2_, Ei_2_yy);

                eabs = length(E_3_) + length(E_3_g) / R_g + length(E_3_b) / R_b + length(E_3_y) / R_y + length(E_3_gg) / R_gg + length(E_3_gb) / R_gb + length(E_3_gy) / R_gy + length(E_3_bg) / R_bg + length(E_3_bb) / R_bb + length(E_3_by) / R_by + length(E_3_yg) / R_yg + length(E_3_yb) / R_yb + length(E_3_yy) / R_yy;
                esummandsup = max(esummandsup, eabs);
            }

            dsum += dsummandsup;
            esum += esummandsup;
        }

        dsup = max(dsup, dsum);
        esup = max(esup, esum);
    }
}

void main(void){
    fDESupTaylor_a10_w3_d2();

    Output.Data[gl_GlobalInvocationID.x][gl_GlobalInvocationID.y][0] = dsup;
    Output.Data[gl_GlobalInvocationID.x][gl_GlobalInvocationID.y][1] = esup;
}

如果我运行上面的程序并使用着色器中我用注释标记的两条相同的行中的一行,我会得到值esup(第二个打印值)的不同结果。如果我使用第一行

D_3_yg = fComplexTimes(D_2_gy, Di_2_) + fComplexTimes(D_2_g, Di_2_y) + fComplexTimes(D_2_y, Di_2_g) + fComplexTimes(D_2_, Di_2_gy) + E_2_gy;

我得到esup = 85930.453,如果我使用

D_3_yg = D_3_gy;

(请注意,这应该给出相同的结果)我得到esup = 85928.164。

另外,如果我改变了行

zg = goffset + gl_GlobalInvocationID.x;
fg = float(zg) / fmg;

zb = boffset + gl_GlobalInvocationID.y;
fb = float(zb) / fmb;

zg = gl_GlobalInvocationID.x;
fg = float(zg) / fmg;

zb = gl_GlobalInvocationID.y;
fb = float(zb) / fmb;

(请注意,goffset和boffset在C ++中设置为0)我得到了另一个结果(当我分别使用第一行或第二行时,esup = 85929.844和esup = 85929.742)。

另一个非常奇怪的方面是&#34;完整的错误&#34;只有当常量my设置为128时才会出现。对于我的其他值,例如64,127,129或256,更改两行不会改变esup的值,但仍然会删除goffset和boffset。

此外,D_3_yg的值(由两条不同的行设置)实际上对esup的计算没有任何作用,但仅适用于dsup。但是dsup保持不变并且esup会发生变化。

我正在使用Nvidia Quadro M2000M和Visual Studio 2012进行编译。任何想法可能是什么问题?

请注意,程序运行大约需要20秒,屏幕会在此期间冻结。在Windows中,您需要将TdrDelay增加到60(秒)

https://docs.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys

因为Windows杀死了GPU计算,默认情况下耗时超过2秒。

1 个答案:

答案 0 :(得分:1)

你的着色器做了很多浮点运算。任何行都与之前的结果一致。 精度误差累积

此外,GPU可以以更高的精度进行计算;但每次存储结果时都会被截断。有时候编译器可以优化它,有时候它可以优化它。

你应该阅读
What Every Programmer Should Know About Floating-Point Arithmetic

What Every Computer Scientist Should Know About Floating-Point Arithmetic

一些特殊的,恶劣的条件案件的问题会增加。一个简单的几何类似问题是确定两条几乎平行的线之间的交点。结果可能会发生很大变化。

您应该尝试简化那些长时间的计算。使用双精度类型(自OPenGL 4.0起可用)可能有所帮助,但可能还不够。