OpenGL glDrawArraysInstanced比glDrawElementsInstanced更快:有没有办法使glDrawElementsInstanced更快?

时间:2019-07-14 05:50:46

标签: c++ opengl sdl geometry-instancing

我正在使用OpenGL实例化方法进行测试,并且测试了glDrawArraysInstanced和glDrawElementsInstanced函数,其余代码几乎相同。测试是渲染100万个彩色正方形(缩放为像素并适合屏幕大小),它们也可以旋转和平移,并观察FPS差异。 我得到的结果是:

  • 使用glDrawArraysInstanced:36-39 FPS
  • 使用glDrawElementsInstanced:24-28 FPS

根据我的研究,我没有得出哪个函数应该更快的结论,但是我更喜欢使用glDrawElementsInstanced。

所以我的问题是在这种情况下(或一般而言)是否有一种改进glDrawElementsInstanced的方法,以使使用它的绘图等效于或快于glDrawArraysInstanced。 这是我用于glDrawArraysInstanced的代码:

int main(int argc, char** argv) {

int width = 1000, height = 600;
SDLWindow window(width, height, "window");
const char* glsl_version = "#version 330";

srand((unsigned int)time(NULL));

Shader shader("shaders/shader.vert", "shaders/shader.frag");

float quadVertices[] = {
    // positions     
    -1.0f,  1.0f,
     1.0f, -1.0f,
    -1.0f, -1.0f,

    -1.0f,  1.0f,
     1.0f, -1.0f,
     1.0f,  1.0f
};

float colors[] = {
    1.0f, 0.0f, 0.0f,
    0.0f, 1.0f, 0.0f,
    0.0f, 0.0f, 1.0f,

    1.0f, 0.0f, 0.0f,
    0.0f, 1.0f, 0.0f,
    0.0f, 1.0f, 1.0f
};

// Gera VAO e buffers
GLuint VBO, VAO, TRANSFORM;

unsigned int amount = 1000000;

glm::mat4 *modelMatrices;
modelMatrices = new glm::mat4[amount];

GLsizei vec4Size = sizeof(glm::vec4);

glGenVertexArrays(1, &VAO);
glGenBuffers(1, &VBO);
glGenBuffers(1, &TRANSFORM);

glBindVertexArray(VAO);

// configuração de VBO
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(quadVertices) + sizeof(colors), nullptr, GL_STREAM_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(quadVertices), quadVertices);
glBufferSubData(GL_ARRAY_BUFFER, sizeof(quadVertices), sizeof(colors), colors);

glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(float), (void*)0);
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float), (void*)(sizeof(quadVertices)));

glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);

// configuração de TRANSFORM
glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
glBufferData(GL_ARRAY_BUFFER, amount * sizeof(glm::vec2) + amount * sizeof(glm::vec2) + amount * sizeof(glm::vec4), nullptr, GL_STREAM_DRAW);
glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)0);
glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)(sizeof(glm::vec2) * amount));
glVertexAttribPointer(4, 4, GL_FLOAT, GL_FALSE, sizeof(glm::vec4), (void*)(sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount));

glEnableVertexAttribArray(2);
glEnableVertexAttribArray(3);
glEnableVertexAttribArray(4);
glVertexAttribDivisor(2, 1);
glVertexAttribDivisor(3, 1);
glVertexAttribDivisor(4, 1);

glBindVertexArray(0);

MathUtils mu;

vector <float> randNumbers;

float amountSqrt = glm::sqrt(amount);

for (int i = 0; i < amount; i++)
{
    randNumbers.push_back(mu.GenRandFloat(-1.0f, 1.0f) * amountSqrt);
}
glm::vec2 *scale = new glm::vec2[amount];
glm::vec2 *translate = new glm::vec2[amount];
glm::vec4 *rotate = new glm::vec4[amount];

void *transformBfrPtr = nullptr;

bool quit = 0;
while (!quit)
{
    glClearColor(1.0, 1.0, 1.0, 1.0);
    glClear(GL_COLOR_BUFFER_BIT);

    shader.use();

    glBindVertexArray(VAO);

    glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, 0, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
    for (int i = 0; i < amount; i++)
    {
        scale[i][0] = 1.0f / amountSqrt;
        scale[i][1] = 1.0f / amountSqrt;
    }

    std::memcpy(transformBfrPtr, scale, sizeof(scale[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    for (int i = 0; i < amount; i++)
    {
        translate[i][0] = randNumbers[i];
        translate[i][1] = randNumbers[amount - i];
    }

    std::memcpy(transformBfrPtr, translate, sizeof(translate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount, sizeof(glm::vec4) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    float var = SDL_GetTicks() / 1000.0f;
    for (int i = 0; i < amount; i++)
    {
        rotate[i][0] = cos(var);
        rotate[i][1] = sin(var);
        rotate[i][2] = -sin(var);
        rotate[i][3] = cos(var);
    }

    std::memcpy(transformBfrPtr, rotate, sizeof(rotate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);

    glDrawArraysInstanced(GL_TRIANGLES, 0, 6, amount);

    switch (window.SDL_ManageEvent())
    {
    case SDL_QUIT:
        SDL_Quit();
        glDeleteBuffers(1, &VBO);
        glDeleteVertexArrays(1, &VAO);
        break;
    }

    window.SDL_SwapWindow();
}


return 0;
}

以及使用glDrawElementsInstanced的代码(几乎相同,但带有索引):

int main(int argc, char** argv) {

int width = 1000, height = 600;
SDLWindow window(width, height, "window");
const char* glsl_version = "#version 330";

srand((unsigned int)time(NULL));

Shader shader("shaders/shader.vert", "shaders/shader.frag");

float quadVertices[] = {
    // positions     
    -1.0f,  1.0f,
     1.0f, -1.0f,
    -1.0f, -1.0f,
     1.0f,  1.0f
};

float colors[] = {
    1.0f, 0.0f, 0.0f,
    0.0f, 1.0f, 0.0f,
    0.0f, 0.0f, 1.0f,
    0.0f, 1.0f, 1.0f
};

unsigned int indices[] = {
    0, 1, 2,
    0, 1, 3
};

// Gera VAO e buffers
GLuint VBO, VAO, TRANSFORM, EBO;

unsigned int amount = 1000000;

glm::mat4 *modelMatrices;
modelMatrices = new glm::mat4[amount];

GLsizei vec4Size = sizeof(glm::vec4);

glGenVertexArrays(1, &VAO);
glGenBuffers(1, &VBO);
glGenBuffers(1, &TRANSFORM);
glGenBuffers(1, &EBO);

glBindVertexArray(VAO);

// configuração de VBO
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(quadVertices) + sizeof(colors), nullptr, GL_STREAM_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(quadVertices), quadVertices);
glBufferSubData(GL_ARRAY_BUFFER, sizeof(quadVertices), sizeof(colors), colors);

glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(float), (void*)0);
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float), (void*)(sizeof(quadVertices)));

glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);

// configuração de EBO
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STREAM_DRAW);

// configuração de TRANSFORM
glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
glBufferData(GL_ARRAY_BUFFER, amount * sizeof(glm::vec2) + amount * sizeof(glm::vec2) + amount * sizeof(glm::vec4), nullptr, GL_STREAM_DRAW);
glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)0);
glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)(sizeof(glm::vec2) * amount));
glVertexAttribPointer(4, 4, GL_FLOAT, GL_FALSE, sizeof(glm::vec4), (void*)(sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount));

glEnableVertexAttribArray(2);
glEnableVertexAttribArray(3);
glEnableVertexAttribArray(4);
glVertexAttribDivisor(2, 1);
glVertexAttribDivisor(3, 1);
glVertexAttribDivisor(4, 1);

glBindVertexArray(0);

glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);

MathUtils mu;

vector <float> randNumbers;

float amountSqrt = glm::sqrt(amount);

for (int i = 0; i < amount; i++)
{
    randNumbers.push_back(mu.GenRandFloat(-1.0f, 1.0f) * amountSqrt);
}
glm::vec2 *scale = new glm::vec2[amount];
glm::vec2 *translate = new glm::vec2[amount];
glm::vec4 *rotate = new glm::vec4[amount];

void *transformBfrPtr = nullptr;

bool quit = 0;
while (!quit)
{
    glClearColor(1.0, 1.0, 1.0, 1.0);
    glClear(GL_COLOR_BUFFER_BIT);

    shader.use();

    glBindVertexArray(VAO);

    glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, 0, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
    for (int i = 0; i < amount; i++)
    {
        scale[i][0] = 1.0f / amountSqrt;
        scale[i][1] = 1.0f / amountSqrt;
    }

    std::memcpy(transformBfrPtr, scale, sizeof(scale[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    for (int i = 0; i < amount; i++)
    {
        translate[i][0] = randNumbers[i];
        translate[i][1] = randNumbers[amount - i];
    }

    std::memcpy(transformBfrPtr, translate, sizeof(translate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount, sizeof(glm::vec4) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    float var = SDL_GetTicks() / 1000.0f;
    for (int i = 0; i < amount; i++)
    {
        rotate[i][0] = cos(var);
        rotate[i][1] = sin(var);
        rotate[i][2] = -sin(var);
        rotate[i][3] = cos(var);
    }

    std::memcpy(transformBfrPtr, rotate, sizeof(rotate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);

    glDrawElementsInstanced(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0, amount);

    switch (window.SDL_ManageEvent())
    {
    case SDL_QUIT:
        SDL_Quit();
        glDeleteBuffers(1, &VBO);
        glDeleteVertexArrays(1, &VAO);
        break;
    }

    window.SDL_SwapWindow();
}


return 0;
}

如果有帮助的话,下面是顶点着色器的代码:

#version 330 core
// Per-vertex attributes of the quad.
layout (location = 0) in vec2 aPos;
layout (location = 1) in vec3 aColor;
// Transform attributes; the host code sets an attribute divisor of 1 on
// locations 2-4, so each value is read once per instance.
layout (location = 2) in vec2 scale;
layout (location = 3) in vec2 translation;
layout (location = 4) in vec4 rotation;

// Vertex color forwarded to the next shader stage.
out vec3 color;

void main()
{
    // Unpack the vec4 into a 2x2 matrix whose columns are (x, y) and (z, w).
    mat2 rotationMatrix = mat2(rotation.xy, rotation.zw);

    // Vector-times-matrix order is intentional: aPos is used as a row vector.
    vec2 rotated = aPos * rotationMatrix;

    // Translation is applied before the scale, so the offset is scaled as well.
    vec2 placed = (rotated + translation) * scale;

    gl_Position = vec4(placed, 0.0, 1.0);
    color = aColor;
}

谢谢。

更新: 好的,看来我有一个与此无关的问题。我在另一台机器上测试了这两个代码,并对结果感到惊讶。首先介绍一下背景: 上面的测试是在此(PC)设置上进行的: 16GB内存,i7 4790k,GTX 970。

然后,我使用此设置在笔记本电脑上进行了相同的测试: 8GB内存,i5 8250u,Geforce MX150(2GB)

结果:

  • 使用glDrawArraysInstanced:73-87 FPS
  • 使用glDrawElementsInstanced:74-87 FPS

我还注意到,在笔记本电脑上,GPU的使用率一直保持在99%左右,而在我的PC上,无论使用glDrawElementsInstanced还是glDrawArraysInstanced,GPU使用率都在60%左右波动。在这两种情况下,所有内核的CPU使用率都在60%以下(因此不可能是CPU瓶颈)。 我尝试过卸载GPU驱动程序,并使用DDU进行干净的重新安装,但没有效果。 我还检查了两个系统中的Nvidia控制面板,设置完全相同。两台机器上安装的都是Windows 10,并且都在Windows控制面板中设置为最高性能。 我的第一个问题得到了部分回答,因为在不同的机器上我得到了非常不同的FPS结果(两个绘制调用函数之间的差异也不同)。 这样的差异怎么可能出现?欢迎提供任何帮助。

0 个答案:

没有答案