如何测量OpenGL中的峰值内存带宽?

时间:2015-06-30 10:59:09

标签: c++ opengl glsl nvidia

为了了解我应该期待什么样的速度,我一直在尝试对全局内存和着色器之间的传输进行基准测试,而不是依赖于GPU规格表。但是我无法接近理论上的最大值。事实上,我测得的结果与理论值相差了约50倍!

我使用的是GTX Titan X,据称其内存带宽为336.5GB/s。系统为Linux x64,驱动程序版本352.21。

我找到了一个CUDA基准测试(代码见下文),它给出约240-250GB/s(这更接近我的预期)。

我试图用着色器完全复现该基准测试的行为。我尝试了顶点着色器、计算着色器,通过image_load_store和NV_shader_buffer_store访问缓冲区对象,使用float、vec4,在着色器内部循环(在工作组内使用合并寻址),以及各种计时方法。我一直停留在~7GB/s(见下面的更新)。

为什么GL这么慢?我做错了什么,如果是的话,应该怎么做?

这是我的MWE,其中包含三种方法(1. 使用image_load_store的顶点着色器;2. 使用无绑定图形的顶点着色器;3. 使用无绑定图形的计算着色器):

//#include <windows.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <GL/glew.h>
#include <GL/glut.h>

// GLSL source for the image_load_store path (compiled as a vertex shader in main).
// Each invocation stores `val` into element gl_VertexID of the r32f imageBuffer
// bound to uniform `data`; gl_Position is a dummy value.
const char* imageSource =
    "#version 440\n"
    "uniform layout(r32f) imageBuffer data;\n"
    "uniform float val;\n"
    "void main() {\n"
    "   imageStore(data, gl_VertexID, vec4(val, 0.0, 0.0, 0.0));\n"
    "   gl_Position = vec4(0.0);\n"
    "}\n";

// GLSL source for the bindless path (compiled as a vertex shader in main).
// `data` is a raw float pointer uniform; each invocation writes `val` to
// data[gl_VertexID]. gl_Position is a dummy value.
const char* bindlessSource =
    "#version 440\n"
    "#extension GL_NV_gpu_shader5 : enable\n"
    "#extension GL_NV_shader_buffer_load : enable\n"
    "uniform float* data;\n"
    "uniform float val;\n"
    "void main() {\n"
    "   data[gl_VertexID] = val;\n"
    "   gl_Position = vec4(0.0);\n"
    "}\n";

// GLSL source for the bindless compute path (compiled as a compute shader in main).
// Work groups are 256 invocations wide; each invocation writes `val` to
// data[gl_GlobalInvocationID.x] through the raw float pointer uniform `data`.
const char* bindlessComputeSource =
    "#version 440\n"
    "#extension GL_NV_gpu_shader5 : enable\n"
    "#extension GL_NV_shader_buffer_load : enable\n"
    "layout(local_size_x = 256) in;\n"
    "uniform float* data;\n"
    "uniform float val;\n"
    "void main() {\n"
    "   data[gl_GlobalInvocationID.x] = val;\n"
    "}\n";

// Compiles `shaderSrc` as a shader of stage `type` and links it into a
// single-stage program.
//
// Parameters:
//   type      - shader stage enum handed to glCreateShader (e.g. GL_VERTEX_SHADER).
//   shaderSrc - NUL-terminated GLSL source text.
// Returns the linked program object. If compilation or linking fails, the info
// log is printed to stdout and the process exits with a failure status.
GLuint compile(GLenum type, const char* shaderSrc)
{
    GLuint shader = glCreateShader(type);
    glShaderSource(shader, 1, (const GLchar**)&shaderSrc, NULL);
    glCompileShader(shader);

    GLint success = 0;
    GLint loglen = 0;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &success);
    if (!success)
    {
        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &loglen);
        // One extra byte keeps the buffer valid and NUL-terminated even when
        // the driver reports a zero-length log.
        GLchar* log = new GLchar[loglen + 1];
        log[0] = '\0';
        glGetShaderInfoLog(shader, loglen + 1, NULL, log);
        printf("%s\n", log);
        delete[] log;
        exit(EXIT_FAILURE);
    }

    GLuint program = glCreateProgram();
    glAttachShader(program, shader);
    glLinkProgram(program);

    // A shader that compiles can still fail to link; report that too instead of
    // returning an unusable program.
    glGetProgramiv(program, GL_LINK_STATUS, &success);
    if (!success)
    {
        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &loglen);
        GLchar* log = new GLchar[loglen + 1];
        log[0] = '\0';
        glGetProgramInfoLog(program, loglen + 1, NULL, log);
        printf("%s\n", log);
        delete[] log;
        exit(EXIT_FAILURE);
    }

    // The shader stays attached to the program, so this only flags it for
    // deletion; it is released together with the program.
    glDeleteShader(shader);
    return program;
}

GLuint timerQueries[2];
void start()
{
    glGenQueries(2, timerQueries);
    glQueryCounter(timerQueries[0], GL_TIMESTAMP);
}

float stop()
{
    glMemoryBarrier(GL_ALL_BARRIER_BITS);
    GLsync sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
    glWaitSync(sync, 0, GL_TIMEOUT_IGNORED);
    glQueryCounter(timerQueries[1], GL_TIMESTAMP);
    GLint available = 0;
    while (!available) //sometimes gets stuck here for whatever reason
        glGetQueryObjectiv(timerQueries[1], GL_QUERY_RESULT_AVAILABLE, &available);
    GLuint64 a, b;
    glGetQueryObjectui64v(timerQueries[0], GL_QUERY_RESULT, &a);
    glGetQueryObjectui64v(timerQueries[1], GL_QUERY_RESULT, &b);
    glDeleteQueries(2, timerQueries);
    return b - a;
}

int main(int argc, char** argv)
{
    float* check;
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
    glutCreateWindow("test");
    glewInit();

    int bufferSize = 64 * 1024 * 1024; //64MB
    int loops = 500;

    glEnable(GL_RASTERIZER_DISCARD);

    float* dat = new float[bufferSize/sizeof(float)];
    memset(dat, 0, bufferSize);

    //create a buffer with data
    GLuint buffer;
    glGenBuffers(1, &buffer);
    glBindBuffer(GL_TEXTURE_BUFFER, buffer);
    glBufferData(GL_TEXTURE_BUFFER, bufferSize, NULL, GL_STATIC_DRAW);

    //get a bindless address
    GLuint64 address;
    glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
    glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);

    //make a texture alias for it
    GLuint bufferTexture;
    glGenTextures(1, &bufferTexture);
    glBindTexture(GL_TEXTURE_BUFFER, bufferTexture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_R32F, buffer);
    glBindImageTextureEXT(0, bufferTexture, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R32F);

    //compile the shaders
    GLuint imageShader = compile(GL_VERTEX_SHADER, imageSource);
    GLuint bindlessShader = compile(GL_VERTEX_SHADER, bindlessSource);
    GLuint bindlessComputeShader = compile(GL_COMPUTE_SHADER, bindlessComputeSource);

    //warm-up and check values
    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glUseProgram(imageShader);
    glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
    glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
    glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    //check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
    //for (int i = 0; i < bufferSize/sizeof(float); ++i)
    //  assert(check[i] == 1.0f);
    //glUnmapBuffer(GL_TEXTURE_BUFFER);

    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glUseProgram(bindlessShader);
    glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
    glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    //glMemoryBarrier(GL_ALL_BARRIER_BITS); //this causes glDispatchCompute to segfault later, so don't uncomment
    //check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
    //for (int i = 0; i < bufferSize/sizeof(float); ++i)
    //  assert(check[i] == 1.0f);
    //glUnmapBuffer(GL_TEXTURE_BUFFER);

    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glUseProgram(bindlessComputeShader);
    glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
    glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
    glMemoryBarrier(GL_ALL_BARRIER_BITS);
    //check = (float*)glMapBuffer(GL_TEXTURE_BUFFER, GL_READ_ONLY);
    //for (int i = 0; i < bufferSize/sizeof(float); ++i)
    //  assert(check[i] == 1.0f); //glDispatchCompute doesn't actually write anything with bindless graphics
    //glUnmapBuffer(GL_TEXTURE_BUFFER);
    glFinish();

    //time image_load_store
    glUseProgram(imageShader);
    glUniform1i(glGetUniformLocation(imageShader, "data"), 0);
    glUniform1f(glGetUniformLocation(imageShader, "val"), 1.0f);
    start();
    for (int i = 0; i < loops; ++i)
        glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    GLuint64 imageTime = stop();
    printf("image_load_store: %.2fGB/s\n", (float)((bufferSize * (double)loops) / imageTime));

    //time bindless
    glUseProgram(bindlessShader);
    glProgramUniformui64NV(bindlessShader, glGetUniformLocation(bindlessShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessShader, "val"), 1.0f);
    start();
    for (int i = 0; i < loops; ++i)
        glDrawArrays(GL_POINTS, 0, bufferSize/sizeof(float));
    GLuint64 bindlessTime = stop();
    printf("bindless: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessTime));

    //time bindless in a compute shader
    glUseProgram(bindlessComputeShader);
    glProgramUniformui64NV(bindlessComputeShader, glGetUniformLocation(bindlessComputeShader, "data"), address);
    glUniform1f(glGetUniformLocation(bindlessComputeShader, "val"), 1.0f);
    start();
    for (int i = 0; i < loops; ++i)
        glDispatchCompute(bufferSize/(sizeof(float) * 256), 1, 1);
    GLuint64 bindlessComputeTime = stop();
    printf("bindless compute: %.2fGB/s\n", (float)((bufferSize * (double)loops) / bindlessComputeTime));
    assert(glGetError() == GL_NO_ERROR);
    return 0;
}

我的输出:

image_load_store: 6.66GB/s
bindless: 6.68GB/s
bindless compute: 6.65GB/s

一些注意事项:

  1. 使用无绑定图形的计算着色器似乎没有写入任何内容(注释掉的断言会失败),或者至少用glMapBuffer取不回这些数据,尽管其速度与其他方法一致。在计算着色器中使用image_load_store是可行的,并且速度与顶点着色器相同(不过我认为把这种组合也贴出来就太多了)。
  2. 在glDispatchCompute之前调用glMemoryBarrier(GL_ALL_BARRIER_BITS)会导致驱动程序崩溃。
  3. 注释掉用于检查输出的三个glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);调用后,前两个测试的速度提高到17GB/s,计算着色器则猛增到292GB/s,这更接近我想要的结果,但由于第1点,这个数字不可信。
  4. 有时候while (!available)会挂起很长时间(当我厌倦了等待时,ctrl-c会显示它仍在循环中)。
  5. 供参考,这是CUDA代码:

    //http://www.ks.uiuc.edu/Research/vmd/doxygen/CUDABench_8cu-source.html
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <cuda.h>
    
    // Error-check macro: fetches cudaGetLastError(); if it is not cudaSuccess,
    // prints the error string with file and line, then executes `return -1` in
    // the enclosing function. Performs no cleanup before returning.
    #define CUERR { cudaError_t err; \
        if ((err = cudaGetLastError()) != cudaSuccess) { \
        printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
        return -1; }}
    
    //
    // GPU device global memory bandwidth benchmark
    //
    // Copy kernel: every thread copies the one element selected by its global
    // thread index from `src` to `dest`. There is no bounds check, so the launch
    // must not span more threads than the arrays have elements.
    template <typename T>
    __global__ void gpuglobmemcpybw(T *dest, const T *src) {
        const unsigned int blockStart = blockIdx.x * blockDim.x;
        const unsigned int element = blockStart + threadIdx.x;
        dest[element] = src[element];
    }
    
    // Fill kernel: every thread writes `val` to the one element of `dest`
    // selected by its global thread index. There is no bounds check, so the
    // launch must not span more threads than `dest` has elements.
    template <typename T>
    __global__ void gpuglobmemsetbw(T *dest, const T val) {
        const int element = blockIdx.x * blockDim.x + threadIdx.x;
        dest[element] = val;
    }
    
    // Element type moved by the benchmark kernels.
    typedef float4 datatype;
    
    // Measures global-memory throughput of the fill and copy kernels on CUDA
    // device `cudadev`.
    //
    // Parameters:
    //   cudadev        - device index passed to cudaSetDevice.
    //   gpumemsetgbsec - out: throughput of the fill kernel.
    //   gpumemcpygbsec - out: throughput of the copy kernel (bytes read plus
    //                    bytes written are both counted).
    // Returns 0 on success, -1 if selecting the device fails or any CUERR check
    // trips. Both outputs are zeroed before any early return.
    // NOTE(review): early returns through CUERR skip cudaFree/cudaEventDestroy,
    // so device allocations and events are not released on those paths.
    static int cudaglobmembw(int cudadev, double *gpumemsetgbsec, double *gpumemcpygbsec) {
        int i;
        int len = 1 << 22; // one thread per data element
        int loops = 500;
        datatype *src, *dest;
        datatype val=make_float4(1.0f, 1.0f, 1.0f, 1.0f);
    
        // initialize to zero for starters
        float memsettime = 0.0f;
        float memcpytime = 0.0f;
        *gpumemsetgbsec = 0.0;
        *gpumemcpygbsec = 0.0;
    
        // attach to the selected device
        cudaError_t rc;
        rc = cudaSetDevice(cudadev);
        if (rc != cudaSuccess) {
            #if CUDART_VERSION >= 2010
            rc = cudaGetLastError(); // query last error and reset error state
            if (rc != cudaErrorSetOnActiveProcess)
            return -1; // abort and return an error
            #else
            cudaGetLastError(); // just ignore and reset error state, since older CUDA
            // revs don't have a cudaErrorSetOnActiveProcess enum
            #endif
        }
    
        // allocate the source and destination arrays, len elements each
        cudaMalloc((void **) &src, sizeof(datatype)*len);
        CUERR
        cudaMalloc((void **) &dest, sizeof(datatype)*len);
        CUERR
    
        // 256 threads per block; the grid size is chosen so that
        // blocks * threads == len (one thread per element)
        dim3 BSz(256, 1, 1);
        dim3 GSz(len / (BSz.x * BSz.y * BSz.z), 1, 1); 
    
        // do a warm-up pass
        gpuglobmemsetbw<datatype><<< GSz, BSz >>>(src, val);
        CUERR
        gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
        CUERR
        gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
        CUERR
    
        // events bracketing each timed region
        cudaEvent_t start, end;
        cudaEventCreate(&start);
        cudaEventCreate(&end);
    
        // execute the memset kernel
        cudaEventRecord(start, 0);
        for (i=0; i<loops; i++) {
        gpuglobmemsetbw<datatype><<< GSz, BSz >>>(dest, val);
        }
        CUERR
        cudaEventRecord(end, 0);
        CUERR
        cudaEventSynchronize(start);
        CUERR
        cudaEventSynchronize(end);
        CUERR
        cudaEventElapsedTime(&memsettime, start, end);
        CUERR
    
        // execute the memcpy kernel
        cudaEventRecord(start, 0);
        for (i=0; i<loops; i++) {
        gpuglobmemcpybw<datatype><<< GSz, BSz >>>(dest, src);
        }
        cudaEventRecord(end, 0);
        CUERR
        cudaEventSynchronize(start);
        CUERR
        cudaEventSynchronize(end);
        CUERR
        cudaEventElapsedTime(&memcpytime, start, end);
        CUERR
    
        cudaEventDestroy(start);
        CUERR
        cudaEventDestroy(end);
        CUERR
    
        // bytes per pass scaled by 1024*1024, divided by the average time per pass;
        // the copy moves 2*len elements per pass (one read and one write each).
        // NOTE(review): cudaEventElapsedTime reports milliseconds, so these are
        // MiB per millisecond, which is close to but not exactly 10^9 bytes/s.
        *gpumemsetgbsec = (len * sizeof(datatype) / (1024.0 * 1024.0)) / (memsettime / loops);
        *gpumemcpygbsec = (2 * len * sizeof(datatype) / (1024.0 * 1024.0)) / (memcpytime / loops);
        cudaFree(dest);
        cudaFree(src);
        CUERR
    
        return 0;
    }
    
    // Runs the bandwidth benchmark on device 0 and prints the fill and copy
    // throughput figures on one line.
    int main()
    {
        double memsetRate, memcpyRate;
        cudaglobmembw(0, &memsetRate, &memcpyRate);

        const float memsetShown = (float)memsetRate;
        const float memcpyShown = (float)memcpyRate;
        printf("%f %f\n", memsetShown, memcpyShown);
        return 0;
    }
    

    更新

    看来,缓冲区是在我那些glBufferData调用处被变为非驻留的,而这些调用原本是为了检查输出是否被写入。根据NV_shader_buffer_load扩展规范:

      

    当缓冲区通过BufferData被重新指定或被删除时,它也会被隐式地变为非驻留。 ...
      BufferData被规定为“删除现有的数据存储”,因此该数据的GPU地址应当失效。缓冲区因此在当前上下文中变为非驻留。

    据我猜测,此后OpenGL每帧都重新流式传入缓冲区对象的数据,而不是将其缓存在显存中。这解释了为什么计算着色器会使断言失败;不过有一点轻微的异常:顶点着色器中的无绑定图形在缓冲区非驻留时仍然有效,但我暂时忽略这一点。我不明白为什么在有12GB可用显存的情况下,一个64MB的缓冲区对象不会默认驻留(尽管也许会在第一次使用之后驻留)。

    因此,在每次调用glBufferData之后,我再次将其设置为驻留并获取地址以防其更改:

    // Re-specifying the data store makes the buffer non-resident, so restore
    // residency and re-query the GPU address in case it changed.
    glBufferData(GL_TEXTURE_BUFFER, bufferSize, dat, GL_STATIC_DRAW);
    glMakeBufferResidentNV(GL_TEXTURE_BUFFER, GL_READ_WRITE);
    glGetBufferParameterui64vNV(GL_TEXTURE_BUFFER, GL_BUFFER_GPU_ADDRESS_NV, &address);
    assert(glIsBufferResidentNV(GL_TEXTURE_BUFFER)); //sanity check
    

    现在,无论使用image_load_store还是无绑定图形,计算着色器都能达到270-290GB/s。现在我的问题包括:

    • 鉴于缓冲区似乎是每个测试都驻留的,并且计算着色器很好而且速度快,为什么顶点着色器版本仍然很慢?
    • 如果没有无绑定图形扩展,普通OpenGL用户应该如何把数据放入显存(是真正地放进去,而不是含糊地向驱动程序暗示它或许可以这样做)?

      我很确定,如果实际应用中存在这个问题,我早就会注意到;应该是这个人为构造的基准测试触发了一条慢速路径。那么我该如何诱使驱动程序让缓冲区对象驻留?先运行一次计算着色器并不会改变任何结果。

1 个答案:

答案 0 :(得分:1)

您要求驱动程序从您的进程内存中读取dat。这会导致广泛的缓存一致性流量。当GPU读取该内存时,它无法确定它是否是最新的,它可能位于CPU缓存中,已修改,并且尚未写回RAM。这导致GPU实际上必须从CPU缓存读取,这比绕过CPU和读取RAM要昂贵得多。 RAM在正常操作期间通常处于空闲状态,因为现代CPU的命中率通常为95%至99%。缓存连续使用。

要获得最佳性能,您需要让驱动程序分配内存。程序使用的正常内存,如全局变量,堆在 writeback 内存中分配。驱动程序分配的内存通常会被分配为写入组合不可缓存,从而消除了一致性流量。

只有在没有缓存一致性开销的情况下才能实现峰值通告带宽数量。

要让驱动程序分配它,请在调用glBufferData时传入nullptr作为数据。

但是,如果你设法强制驱动程序使用系统内存写入组合缓冲区,那么它并不完美。 CPU读取到这些地址将非常慢。顺序写入由CPU优化,但随机写入将导致写入组合缓冲区频繁刷新,从而损害性能。