CUDA程序的运行速度不如预期

时间:2017-04-18 21:25:24

标签: c++ cuda

我一直在尝试使用CUDA / C++,并决定制作一个N体(n-body)模拟器。它模拟4096个粒子之间的引力作用。它的运行速度只有大约2到3 FPS,我不完全确定原因。使用的显卡是GTX 980 Ti,所以我原本期望程序能够流畅运行。我知道它可能没有经过充分优化,但没想到它会运行得这么慢。

这段代码只是一个原型,所以写得并不整洁(也未必正确)。

main.cu

#include <Windows.h>
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include "Particle.h"
#include <cuda_runtime.h>

#include <device_launch_parameters.h>
#include <ctime>
#include <string>

// Number of simulated particles; used as the size of p[] and as the loop bound
// in the kernel and in the host loops.
#define N 4096
// Time-step factor applied to the accumulated velocity term in updateParticle.
#define DT 0.00001
// Pi, used by renderMore to compute vertex angles.
# define M_PI           3.14159265358979323846  /* pi */

using namespace std;

// Host-side particle state: uploaded by moveCuda, overwritten by update(),
// and drawn by renderMore().
Particle p[N];

// Number of frames drawn since the first render() call (incremented in render()).
int frames = 0;
// Clock value taken when the first frame is rendered; basis for the FPS figure.
clock_t starttime = clock();
// Not referenced by any code visible in this listing.
clock_t timepassed = 0;
// True until render() has run once; triggers the frame/clock reset there.
bool first = true;
// Most recent frames-per-second value computed in update().
float fps = 0.0f;

/**
 * Advances every particle by one step using brute-force pairwise attraction:
 * the thread that owns particle i visits all N particles.
 *
 * Launch layout: 1D grid, one thread per particle. Threads whose index is
 * >= N return immediately, so any grid that covers at least N threads works.
 *
 * @param out  device array of N particles that receives the new state
 * @param pin  device array of N particles holding the current state
 *
 * The particle's own fields are read from global memory once and kept in
 * locals instead of being re-read on every loop iteration, and the test on the
 * particle's own mass is hoisted out of the loop. The arithmetic itself is
 * unchanged.
 *
 * NOTE(review): the velocity is copied through unchanged and the particle's
 * own velocity term is added once per visited neighbour. That mirrors the
 * previous behaviour and may not be the intended integration scheme - confirm
 * before changing it.
 */
__global__ void updateParticle(Particle* out, Particle *pin)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N)
        return;  // surplus thread: nothing to update, and out[i]/pin[i] would be out of range

    // State of the particle owned by this thread, loaded once.
    const double massI = pin[i].mass;
    const double posxI = pin[i].posx;
    const double posyI = pin[i].posy;
    const double velxI = pin[i].velx;
    const double velyI = pin[i].vely;

    double velx = 0;
    double vely = 0;

    // A particle whose mass is exactly 0 skips every pair, so the whole loop
    // can be skipped for it.
    if (massI != 0)
    {
        for (int j = 0; j < N; j++)
        {
            if (j == i)
                continue;
            const double massJ = pin[j].mass;
            if (massJ == 0)
                continue;
            double difx = posxI - pin[j].posx;
            double dify = posyI - pin[j].posy;
            double len = difx * difx + dify * dify;  // squared distance
            if (len == 0)
                continue;  // coincident particles: direction undefined
            double force = (massI * massJ) / len;
            len = sqrt(len);
            double dirx = -difx / len;
            double diry = -dify / len;
            dirx *= force;
            diry *= force;
            velx += (dirx / massI + velxI) * DT;
            vely += (diry / massI + velyI) * DT;
        }
    }

    double newx = posxI + velx;
    double newy = posyI + vely;

    // Step the coordinates back towards [-1, 1] one unit at a time.
    while (newx > 1)
        newx--;
    while (newx < -1)
        newx++;
    while (newy > 1)
        newy--;
    while (newy < -1)
        newy++;

    out[i].mass = massI;
    out[i].posx = newx;
    out[i].posy = newy;
    out[i].velx = velxI;
    out[i].vely = velyI;
}

void changeViewPort(int w, int h)
{
    glViewport(0, 0, w, h);

}

// Draws every particle with non-zero mass as a small GL_LINE_LOOP centred on
// its position. Particles with mass exactly 1 are white, all others red; the
// outline is scaled by the particle's mass.
//
// The cos/sin offsets do not depend on the particle, so they are computed once
// per call instead of once per particle.
void renderMore()
{
    const int kVertexCount = 5;  // j = 0..4, as before
    double offsetX[kVertexCount];
    double offsetY[kVertexCount];
    for (int j = 0; j < kVertexCount; j++) {
        // NOTE(review): with a divisor of 300 the five vertices span only
        // 4/300 of a full turn; this looks like a leftover from a 300-segment
        // circle - confirm before changing, it is kept as-is here.
        double angle = 2 * M_PI * j / 300;
        offsetX[j] = cos(angle) * 0.001;
        offsetY[j] = sin(angle) * 0.001;
    }

    for (int i = 0; i < N; ++i)
    {
        const double mass = p[i].mass;
        if (mass == 0)
            continue;  // massless particles are not drawn
        if (mass == 1)
            glColor3f(1, 1, 1);
        else
            glColor3f(1, 0, 0);
        glBegin(GL_LINE_LOOP);
        for (int j = 0; j < kVertexCount; j++)
            glVertex2d(offsetX[j] * mass + p[i].posx, offsetY[j] * mass + p[i].posy);
        glEnd();
    }
}

// GLUT display callback: clears the buffers, draws all particles, swaps the
// back buffer in and counts the frame. On the very first call the frame
// counter and the reference clock are reset so the FPS figure starts here.
void render(void)
{
    if (first) {
        first = false;
        starttime = clock();
        frames = 0;
    }

    glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
    renderMore();
    glutSwapBuffers();

    ++frames;
}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` signals a failure, false on cudaSuccess.
static bool cudaFailed(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return false;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Runs one simulation step on the GPU.
//
//   in  - host array of N particles (current state), uploaded to the device
//   out - host array of N particles that receives the new state
//
// Changes from the previous version:
//   * cudaSetDevice is called before any allocation;
//   * the two device buffers are allocated on first use and reused afterwards
//     instead of being allocated and freed on every step;
//   * any failing CUDA call is reported with its error string and ends the
//     step early, leaving `out` untouched, instead of carrying on with
//     invalid buffers.
void moveCuda(Particle* in, Particle* out)
{
    // Device buffers, created once and kept for the lifetime of the process.
    static Particle *device_p = nullptr;
    static Particle *device_res = nullptr;

    const int kThreadsPerBlock = 1024;
    // The launch below uses N / kThreadsPerBlock blocks; a remainder would
    // leave the last particles without a thread.
    static_assert(N % kThreadsPerBlock == 0, "N must be a multiple of the block size");

    const size_t size = N * sizeof(Particle);

    if (cudaFailed(cudaSetDevice(0), "cudaSetDevice"))
        return;

    if (device_res == nullptr &&
        cudaFailed(cudaMalloc((void**)&device_res, size), "cudaMalloc(device_res)")) {
        device_res = nullptr;
        return;
    }
    if (device_p == nullptr &&
        cudaFailed(cudaMalloc((void**)&device_p, size), "cudaMalloc(device_p)")) {
        device_p = nullptr;
        return;
    }

    // Copy the current particle state from host memory to the GPU buffer.
    if (cudaFailed(cudaMemcpy(device_p, in, size, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)"))
        return;

    updateParticle<<<N / kThreadsPerBlock, kThreadsPerBlock>>>(device_res, device_p);

    // Launch-configuration errors are only visible through cudaGetLastError.
    if (cudaFailed(cudaGetLastError(), "updateParticle launch"))
        return;

    // Errors raised while the kernel runs surface at the next synchronising call.
    if (cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize after updateParticle"))
        return;

    cudaFailed(cudaMemcpy(out, device_res, size, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
}

void update(int)
{
    Particle temp[N] = {};
    moveCuda(p, temp);
    for (int i = 0; i < N; ++i)
        p[i] = temp[i];
    fps = (double)frames / ((clock() - starttime) / 1000);
        const string a = "FPS: " + to_string(fps);
        glutSetWindowTitle(a.c_str());

    glutTimerFunc(100.0 / 60, update, -1);
}

// GLUT idle callback: asks GLUT to redraw the window, so render() runs again
// whenever no other events are pending.
void idle()
{
    glutPostRedisplay();
}

int main(int argc, char* argv[])
{
    for (int i = 0; i < N; ++i)
    {
        p[i] = Particle();
    }
    // Initialize GLUT
    glutInit(&argc, argv);
    // Set up some memory buffers for our display
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
    // Set the window size
    glutInitWindowSize(1000, 1000);
    // Create the window with the title "Hello,GL"
    glutCreateWindow("Hello World");
    // Bind the two functions (above) to respond when necessary
    glutReshapeFunc(changeViewPort);
    glutDisplayFunc(render);
    glutTimerFunc(100.0 / 60, update, -1);
    glutIdleFunc(idle);


    // Very important!  This initializes the entry points in the OpenGL driver so we can 
    // call all the functions in the API.
    GLenum err = glewInit();
    if (GLEW_OK != err) {
        fprintf(stderr, "GLEW error");
        return 1;
    }

    render();
    glutMainLoop();
    return 0;
}

Particle.cpp

#include "Particle.h"
#include "stdlib.h"
#include <host_defines.h>

// Draws one rand() value and maps it linearly onto [-1, 1].
static double randomSigned()
{
    const double unit = (double)rand() / (RAND_MAX);
    return (unit * 2) - 1;
}

// Creates a particle with a random position in [-1, 1] x [-1, 1], a random
// velocity with each component in [-0.25, 0.25], and mass 1.
Particle::Particle()
{
    // The four draws happen in this order: posx, posy, velx, vely.
    posx = randomSigned();
    posy = randomSigned();
    velx = randomSigned() / 4;
    vely = randomSigned() / 4;
    mass = 1;
}

Particle.h

#pragma once

// One point mass of the 2D simulation. All members are public and are read
// directly by the kernel and the renderer.
class Particle
{
public:
    // Random position and velocity, mass 1 (see Particle.cpp).
    Particle();
    // Declared only; no definition appears in the listings shown here.
    void Update();

    double posx, posy;  // position
    double velx, vely;  // velocity
    double mass;        // a mass of 0 makes the kernel and renderer skip the particle
};

当我删除设置图形设备的行时,它会抛出错误,但继续以2-3 fps运行。这可能表明它在获取我的显卡时遇到了问题,但我不知道该怎么做。当我将它设置为cudaSetDevice(0)时,它不会抛出错误。显卡正常工作,显示器已连接并正常工作。

如果有人可以提供一些指示或建议,我会非常感激。

1 个答案:

答案 0 :(得分:2)

首先,您可能希望学习CUDA nbody sample code(示例代码),因为它比我下面给出的代码写得更好。另请注意,该示例包含指向this chapter的链接,这也很有参考价值。

我将提供一个似乎比原始代码快得多的代码。以下是我应用的一般策略:

  1. 确保您正在构建发布项目,而不是 debug 项目。
  2. 不要执行不必要的cudaMalloc / cudaFree / cudaMemcpy操作。最好只执行一次这些分配,并重复使用它们。由于您没有在主机代码中修改任何数据(位置、速度),这也意味着我们不需要在moveCuda的每次迭代中都向设备上传数据,只需将数据保留在设备上即可。这样每次迭代只剩下一次cudaMemcpy操作(设备到主机),以便执行OpenGL绘制工作(但请参见下文)。仅此一项我似乎就得到了约3倍的提升。我还实施了“乒乓”缓冲策略,以避免不必要的复制。

  3. 使用float代替double。这有几个好处。首先,它减少了内存流量,因为您只需读取一半的数据量。其次,您使用的GPU对float的数学运算吞吐量要比double大得多。不过我并不认为这是一个计算受限(compute-bound)的内核,所以我认为内存流量是更大的问题。这一步我似乎又得到了约3倍的提升。

  4. 将粒子从AoS转换为SoA。这个主题在cuda标签以及许多其他地方都有介绍,所以我不打算在这里详述。我没有完全做到这一点,而是做了部分转换(将质量移到单独的数组中),然后把剩余的位置x / y和速度x / y打包成一个float4,并对其使用了“向量加载”策略。here是一个示例答案,讨论了AoS->SoA转换及其有价值的原因,以及我在此处使用的向量加载“快捷方式”。

  5. 对于现代GPU来说,4096是相对较少的线程数。从1024线程的块切换到512线程的块,您可以看到一点好处,因为这让内核有更好的机会占满GPU上可用的SM。如果你只有4个或更少的SM,这没有多大区别,但你的980 Ti有22个SM,所以要获得最高性能,最好让每个SM上至少有一个线程块。因此你甚至可能想尝试256个线程的块(总共16个线程块)。

  6. 这是一套相当“昂贵”的计算:

    len = sqrt(len);
    double dirx = -difx / len;
    double diry = -dify / len;
    

    事实证明rsqrtf()和sqrtf()一样容易计算,这样我们就可以把后续的浮点除法运算转换为浮点乘法运算。


    通过这些基本步骤,我可以在非常老的GPU上达到大约30fps,你应该能看到比这更好的结果。我是在linux上开发的,但我认为我所做的任何改动在windows下都不会出问题。

    #include <GL/glew.h>
    #include <GL/freeglut.h>
    #include <iostream>
    #include <vector>
    #include <math.h>
    #include <ctime>
    #include <string>
    #include <cstdlib>
    #include <cstdio>
    #include <time.h>
    // Number of simulated particles; size of the host arrays and loop bound in the kernel.
    #define N 4096
    // Time-step factor applied in updateParticle. Note that the literal has type double.
    #define DT 0.00001
    // Pi, used by renderMore to compute vertex angles.
    #define M_PI           3.14159265358979323846  /* pi */
    
    
    // Particle state packed into a single float4 so that it can be moved with
    // one vector load/store: x and y hold the position, z and w the velocity.
    // The mass is kept in a separate array (pmass).
    class Particle
    {
    public:
        // Fills p with a random position and velocity.
        Particle();

        float4 p;  // (pos x, pos y, vel x, vel y)
    };
    
    // Draws one rand() value and maps it linearly onto [-1, 1].
    static double signedUnitRandom()
    {
        const double unit = (double)rand() / (RAND_MAX);
        return (unit * 2) - 1;
    }

    // Random position in [-1, 1] x [-1, 1] stored in p.x / p.y and a random
    // velocity with components in [-0.25, 0.25] stored in p.z / p.w.
    Particle::Particle()
    {
        // Draw order: x, y, z, w.
        p.x = signedUnitRandom();
        p.y = signedUnitRandom();
        p.z = signedUnitRandom() / 4;
        p.w = signedUnitRandom() / 4;
    }
    // Size in bytes of one array of N particles; used for the cudaMalloc and cudaMemcpy calls.
    const int size = N * sizeof(Particle);
    
    
    using namespace std;
    
    // Host copy of the particle state: overwritten by update() and drawn by renderMore().
    Particle p[N];
    // Host masses: set to 1 in main and read by renderMore().
    float pmass[N];
    // The two device particle buffers that moveCuda alternates between as input and output.
    Particle *d_p1, *d_p2;
    // The matching pair of device mass buffers.
    float *d_pmass1, *d_pmass2;
    // Selects the input buffers for the next moveCuda call: 0 -> d_p1/d_pmass1, otherwise d_p2/d_pmass2.
    int ping_pong = 0;
    // Time in milliseconds between the start and stop events of the last moveCuda call; shown in the title.
    float et;
    // Events recorded around the kernel in moveCuda; created in main.
    cudaEvent_t start, stop;
    // Number of frames drawn since the first render() call.
    int frames = 0;
    // Clock value taken when the first frame is rendered; basis for the FPS figure.
    clock_t starttime = clock();
    // Not referenced by any code visible in this listing.
    clock_t timepassed = 0;
    // True until render() has run once.
    bool first = true;
    // Most recent frames-per-second value computed in update().
    float fps = 0.0f;
    
    /**
     * Advances every particle by one step using brute-force pairwise
     * attraction: the thread that owns particle i visits all N particles.
     *
     * Launch layout: 1D grid, one thread per particle. Threads whose index is
     * >= N return immediately, so any grid covering at least N threads works.
     *
     * @param out        receives the new particle state (N elements)
     * @param pmass_out  receives the masses, copied through unchanged (N elements)
     * @param pin        current particle state (N elements, read only)
     * @param pmass_in   current masses (N elements, read only)
     *
     * Changes from the previous version:
     *   - bounds guard on the thread index;
     *   - DT is converted to float once, so the accumulation stays in single
     *     precision instead of being promoted to double by the double literal
     *     (results can differ in the last bits);
     *   - the test on the thread's own mass is hoisted out of the loop.
     */
    __global__ void updateParticle(Particle * __restrict__  out, float * __restrict__ pmass_out,  const Particle * __restrict__ pin, const float * __restrict__ pmass_in)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= N)
            return;  // surplus thread: every array access below would be out of range

        const float dt = (float)DT;

        const Particle my_i = pin[i];
        const float my_mass_i = pmass_in[i];
        pmass_out[i] = my_mass_i;

        float velx = 0.0f;
        float vely = 0.0f;

        // A particle whose mass is exactly 0 skips every pair, so the loop is
        // skipped as a whole for it.
        if (my_mass_i != 0)
        {
            for (int j = 0; j < N; j++)
            {
                const float my_mass_j = pmass_in[j];
                if (i == j || my_mass_j == 0)
                    continue;
                const Particle my_j = pin[j];
                float difx = my_i.p.x - my_j.p.x;
                float dify = my_i.p.y - my_j.p.y;
                float len = difx * difx + dify * dify;  // squared distance
                if (len == 0)
                    continue;  // coincident particles: direction undefined
                float force = (my_mass_i * my_mass_j) / len;
                len = rsqrtf(len);  // 1 / distance, so the divisions become multiplications
                float dirx = -difx * len;
                float diry = -dify * len;
                dirx *= force;
                diry *= force;
                velx += (dirx / my_mass_i + my_i.p.z) * dt;
                vely += (diry / my_mass_i + my_i.p.w) * dt;
            }
        }

        // Velocity (z, w) is carried over unchanged by the copy; only the
        // position is advanced.
        Particle my_out_i = my_i;
        my_out_i.p.x = my_i.p.x + velx;
        my_out_i.p.y = my_i.p.y + vely;

        // Clamp the new position to [-1, 1] on both axes.
        if (my_out_i.p.x > 1)
            my_out_i.p.x = 1;
        if (my_out_i.p.x < -1)
            my_out_i.p.x = -1;
        if (my_out_i.p.y > 1)
            my_out_i.p.y = 1;
        if (my_out_i.p.y < -1)
            my_out_i.p.y = -1;
        out[i] = my_out_i;
    }
    
    void changeViewPort(int w, int h)
    {
        glViewport(0, 0, w, h);
    
    }
    
    void renderMore()
    {
    
        for (int i = 0; i < N; ++i)
        {
            if (pmass[i] == 0)
                continue;
            if (pmass[i] == 1)
                glColor3f(1, 1, 1);
            else
                glColor3f(1, 0, 0);
            glBegin(GL_LINE_LOOP);
            for (int j = 0; j <= 4; j++) {
                double angle = 2 * M_PI * j / 300;
                double x = cos(angle) * 0.001;
                double y = sin(angle) * 0.001;
                x *= pmass[i];
                y *= pmass[i];
                glVertex2d(x + p[i].p.x, y + p[i].p.y);
            }
            glEnd();
        }
    
    }
    
    // GLUT display callback: clears the buffers, draws all particles, swaps
    // the back buffer in and counts the frame. On the very first call the
    // frame counter and the reference clock are reset.
    void render(void)
    {
        if (first) {
            first = false;
            starttime = clock();
            frames = 0;
        }

        glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
        renderMore();
        glutSwapBuffers();

        ++frames;
    }
    
    // Runs one simulation step on the GPU using the persistent ping-pong
    // buffers and copies the resulting particle state back to the host.
    //
    //   in  - not used: the current state already lives on the device
    //   out - host array of N particles that receives the new state
    //
    // Side effects: on success ping_pong is flipped so that the next call reads
    // what this call wrote, and `et` receives the time in milliseconds between
    // the start and stop events.
    //
    // Changes from the previous version:
    //   * the launch status is actually fetched with cudaGetLastError();
    //     before, the check after the launch re-tested the stale result of
    //     cudaSetDevice and could never report a launch failure;
    //   * cudaSetDevice runs before the first event is recorded;
    //   * a failed step returns early and leaves ping_pong untouched, so the
    //     next call does not read a buffer that was never written;
    //   * messages name the right kernel and end with a newline.
    void moveCuda(Particle* in, Particle* out)
    {
        (void)in;  // kept for interface compatibility

        // Choose input/output buffers for this step.
        const bool readSecond = (ping_pong != 0);
        Particle *d_pi = readSecond ? d_p2 : d_p1;
        Particle *d_po = readSecond ? d_p1 : d_p2;
        float *d_pmassi = readSecond ? d_pmass2 : d_pmass1;
        float *d_pmasso = readSecond ? d_pmass1 : d_pmass2;

        cudaError_t cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
            return;
        }

        cudaEventRecord(start);
        updateParticle<<<N / 256, 256>>>(d_po, d_pmasso, d_pi, d_pmassi);

        // Launch-configuration errors are only visible through cudaGetLastError.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            return;
        }

        // Errors raised while the kernel runs surface at the next synchronising call.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize failed after launching updateParticle: %s\n", cudaGetErrorString(cudaStatus));
            return;
        }
        cudaEventRecord(stop);

        // The step succeeded: the buffers swap roles for the next call.
        ping_pong = readSecond ? 0 : 1;

        cudaStatus = cudaMemcpy(out, d_po, size, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        }

        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&et, start, stop);
    }
    
    void update(int)
    {
        Particle temp[N] = {};
        moveCuda(p, temp);
        for (int i = 0; i < N; ++i)
            p[i] = temp[i];
        char a[64];
        fps = (float)frames / ((clock() - starttime) / CLOCKS_PER_SEC);
            sprintf(a, "FPS: %f, et: %f\0", fps, et);
            glutSetWindowTitle(a);
    
        glutTimerFunc(100.0 / 60, update, -1);
    }
    
    // GLUT idle callback: asks GLUT to redraw the window, so render() runs
    // again whenever no other events are pending.
    void idle()
    {
        glutPostRedisplay();
    }
    
    // Reports a failed CUDA runtime call on stderr.
    // Returns true when `status` is cudaSuccess, false otherwise.
    static bool cudaOk(cudaError_t status, const char *what)
    {
        if (status == cudaSuccess)
            return true;
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }

    // Program entry point: initialises the host particles and masses, creates
    // the persistent device buffers and timing events, uploads the initial
    // state, sets up the GLUT window and callbacks, initialises GLEW, draws one
    // frame and enters the GLUT main loop.
    //
    // Returns 1 when CUDA setup or GLEW initialisation fails, 0 otherwise.
    // The CUDA calls were previously unchecked, so a failure here only showed
    // up later as errors inside moveCuda.
    int main(int argc, char* argv[])
    {
        for (int i = 0; i < N; ++i)
        {
            p[i] = Particle();
            pmass[i] = 1;
        }

        // Device buffers are allocated once and reused by every moveCuda call.
        // Only the "1" buffers need initial data: the first step reads from
        // them and writes the "2" buffers.
        if (!cudaOk(cudaMalloc((void**)&d_p2, size), "cudaMalloc(d_p2)") ||
            !cudaOk(cudaMalloc((void**)&d_p1, size), "cudaMalloc(d_p1)") ||
            !cudaOk(cudaMalloc((void**)&d_pmass2, N*sizeof(float)), "cudaMalloc(d_pmass2)") ||
            !cudaOk(cudaMalloc((void**)&d_pmass1, N*sizeof(float)), "cudaMalloc(d_pmass1)") ||
            !cudaOk(cudaMemcpy(d_p1, p, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_p1)") ||
            !cudaOk(cudaMemcpy(d_pmass1, pmass, N*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_pmass1)") ||
            !cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)") ||
            !cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)"))
        {
            return 1;
        }

        // Initialize GLUT
        glutInit(&argc, argv);
        // Double-buffered RGBA display with a depth buffer
        glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
        // Set the window size
        glutInitWindowSize(1000, 1000);
        // Create the window with the title "Hello World"
        glutCreateWindow("Hello World");
        // Register the callbacks: resize, draw, simulation step timer, idle redraw
        glutReshapeFunc(changeViewPort);
        glutDisplayFunc(render);
        glutTimerFunc(100.0 / 60, update, -1);
        glutIdleFunc(idle);

        // GLEW loads the OpenGL entry points; without it the GL calls in the
        // callbacks cannot be used.
        GLenum err = glewInit();
        if (GLEW_OK != err) {
            fprintf(stderr, "GLEW error");
            return 1;
        }

        render();
        glutMainLoop();
        return 0;
    }
    

    我并不认为这是无缺陷的代码(我认为你的代码也不是),但它的图形表现似乎与原始代码相同。例如,在您的代码中,内核的末尾有这样两行:

    out[i].velx = pin[i].velx;
    out[i].vely = pin[i].vely;
    

    这在我看来不太对,但它并不是这里所讨论的性能问题的核心。

    如果你知道你的质量总是1或0,那么你可以对这段代码进行大量的额外优化,但我没有追求。

    作为补充说明,您可能需要考虑CUDA / OpenGL互操作策略,以摆脱保留在此处的设备 - >主机副本,并将数据永久移至GPU。同样,CUDA nbody示例代码可以是路线图,如果您想开始使用CUDA / GL interop,我认为this presentation有点过时,但是一个很好的起点。