I've been experimenting with CUDA/C++ and decided to make an n-body simulator. It simulates the gravitational attraction between 4096 particles. It runs at around 2 or 3 FPS, and I'm not entirely sure why. The graphics card in use is a GTX 980 Ti, so I would expect the program to run smoothly. I know it probably isn't optimized as well as it could be, but I wouldn't expect it to run this slowly.
The code is only meant to be a prototype, so it is not tidy (or properly written) in any way.
main.cu
#include <Windows.h>
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include "Particle.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <ctime>
#include <string>
#define N 4096
#define DT 0.00001
# define M_PI 3.14159265358979323846 /* pi */
using namespace std;
Particle p[N];
int frames = 0;
clock_t starttime = clock();
clock_t timepassed = 0;
bool first = true;
float fps = 0.0f;
__global__ void updateParticle(Particle* out, Particle *pin)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
double velx = 0;
double vely = 0;
out[i].mass = pin[i].mass;
for(int j = 0; j < N; j++)
{
if (i == j || pin[j].mass == 0 || pin[i].mass == 0)
continue;
double difx = pin[i].posx - pin[j].posx;
double dify = pin[i].posy - pin[j].posy;
double len = difx * difx + dify * dify;
if (len == 0)
continue;
double force = (pin[i].mass * pin[j].mass) / len;
len = sqrt(len);
double dirx = -difx / len;
double diry = -dify / len;
dirx *= force;
diry *= force;
velx += (dirx / pin[i].mass + pin[i].velx) * DT;
vely += (diry / pin[i].mass + pin[i].vely) * DT;
}
out[i].posx = pin[i].posx + velx;
out[i].posy = pin[i].posy + vely;
out[i].velx = pin[i].velx;
out[i].vely = pin[i].vely;
while (out[i].posx > 1)
out[i].posx--;
while (out[i].posx < -1)
out[i].posx++;
while (out[i].posy > 1)
out[i].posy--;
while (out[i].posy < -1)
out[i].posy++;
}
void changeViewPort(int w, int h)
{
glViewport(0, 0, w, h);
}
void renderMore()
{
for (int i = 0; i < N; ++i)
{
if (p[i].mass == 0)
continue;
if (p[i].mass == 1)
glColor3f(1, 1, 1);
else
glColor3f(1, 0, 0);
glBegin(GL_LINE_LOOP);
for (int j = 0; j <= 4; j++) {
double angle = 2 * M_PI * j / 300;
double x = cos(angle) * 0.001;
double y = sin(angle) * 0.001;
x *= p[i].mass;
y *= p[i].mass;
glVertex2d(x + p[i].posx, y + p[i].posy);
}
glEnd();
}
}
void render(void)
{
if(first)
{
frames = 0;
starttime = clock();
first = false;
}
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
renderMore();
glutSwapBuffers();
frames++;
}
void moveCuda(Particle* in, Particle* out)
{
Particle *device_p = nullptr;
Particle *device_res = nullptr;
cudaError_t cudaStatus;
int size = N * sizeof(Particle);
cudaStatus = cudaMalloc((void**)&device_res, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMalloc((void**)&device_p, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(device_p, in, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
updateParticle<<<N / 1024, 1024>>>(device_res, device_p);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaStatus = cudaMemcpy(out, device_res, size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaFree(device_res);
cudaFree(device_p);
}
void update(int)
{
Particle temp[N] = {};
moveCuda(p, temp);
for (int i = 0; i < N; ++i)
p[i] = temp[i];
fps = (double)frames / ((clock() - starttime) / 1000);
const string a = "FPS: " + to_string(fps);
glutSetWindowTitle(a.c_str());
glutTimerFunc(100.0 / 60, update, -1);
}
void idle()
{
glutPostRedisplay();
}
int main(int argc, char* argv[])
{
for (int i = 0; i < N; ++i)
{
p[i] = Particle();
}
// Initialize GLUT
glutInit(&argc, argv);
// Set up some memory buffers for our display
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
// Set the window size
glutInitWindowSize(1000, 1000);
// Create the window with the title "Hello,GL"
glutCreateWindow("Hello World");
// Bind the two functions (above) to respond when necessary
glutReshapeFunc(changeViewPort);
glutDisplayFunc(render);
glutTimerFunc(100.0 / 60, update, -1);
glutIdleFunc(idle);
// Very important! This initializes the entry points in the OpenGL driver so we can
// call all the functions in the API.
GLenum err = glewInit();
if (GLEW_OK != err) {
fprintf(stderr, "GLEW error");
return 1;
}
render();
glutMainLoop();
return 0;
}
Particle.cpp
#include "Particle.h"
#include "stdlib.h"
#include <host_defines.h>
Particle::Particle()
{
posx = (((double)rand() / (RAND_MAX)) * 2) - 1;
posy = (((double)rand() / (RAND_MAX)) * 2) - 1;
velx = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
vely = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
mass = 1;
}
Particle.h
#pragma once
class Particle
{
public:
Particle();
void Update();
double posx;
double posy;
double velx;
double vely;
double mass;
};
When I remove the line that sets the graphics device, it throws an error but keeps running at 2-3 fps. This might indicate that it's having trouble picking up my graphics card, but I don't know what to do about it. When I set it to cudaSetDevice(0) it doesn't throw an error. The graphics card works fine, and the monitor is connected and working.
If anyone could provide some pointers or advice, I would greatly appreciate it.
Answer (score: 2)
First of all, you may want to study the CUDA nbody sample code, since it will demonstrate well-written code better than I will. Also note that that sample contains a link to this chapter, which is also instructive.
I'll provide a version of the code that seems to run much faster than the original. Here are the general strategies I applied:
Don't perform unnecessary cudaMalloc / cudaFree or cudaMemcpy operations. It's better to do those allocations once and reuse them. Since you aren't modifying anything (positions, velocities) in host code, this also means we don't really need to update the device on every iteration of moveCuda; just leave the data on the device. That gets us down to a single cudaMemcpy operation per frame, so that we can do the OpenGL work (but see below). I seem to get about a 3x boost from this. I also implemented a "ping-pong" buffering strategy to avoid unnecessary copying.
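As a rough sketch of the allocate-once, ping-pong idea (the names d_a, d_b, initBuffers and step below are made up for illustration, and the kernel signature is simplified to (out, in)):
Particle *d_a = nullptr, *d_b = nullptr;  // allocated once at startup, reused every frame
int ping_pong = 0;

void initBuffers(const Particle *host_p)
{
    cudaMalloc((void**)&d_a, N * sizeof(Particle));
    cudaMalloc((void**)&d_b, N * sizeof(Particle));
    // one upload; after this the particle data lives on the device
    cudaMemcpy(d_a, host_p, N * sizeof(Particle), cudaMemcpyHostToDevice);
}

void step(Particle *host_out)
{
    Particle *in  = ping_pong ? d_b : d_a;   // read the buffer written last frame
    Particle *out = ping_pong ? d_a : d_b;   // write into the other one
    ping_pong = !ping_pong;
    updateParticle<<<N / 256, 256>>>(out, in);
    // the only per-frame copy: bring results back so OpenGL can draw them
    cudaMemcpy(host_out, out, N * sizeof(Particle), cudaMemcpyDeviceToHost);
}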
Use float instead of double. This has several benefits. First, it reduces memory traffic, since you're fetching half as much data. Second, the GPU you're using has much higher throughput (math operations) for float than for double. I don't really think this is a compute-bound kernel, so I think memory traffic is the bigger issue. I seem to get another roughly 3x boost from this.
Convert the particles from AoS to SoA. This topic is covered on the cuda tag as well as in many other places, so I won't go over it here. I didn't do it completely; instead I did a partial conversion (moving the mass into a separate array) and then used a "vector load" strategy with float4 for the remaining velocity x/y and position x/y quantities. Here is an example answer that discusses the AoS -> SoA conversion and why it is valuable, as well as the vector-load "shortcut" I used here.
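To illustrate the AoS -> SoA idea and the float4 vector-load shortcut (the struct and field names here are illustrative only):
#include <vector_types.h>   // float4

// AoS: a thread reading posx for particle i touches a location strided by sizeof(Particle),
// so a warp's reads are scattered and memory bandwidth is wasted
struct ParticleAoS { double posx, posy, velx, vely, mass; };

// SoA: consecutive threads read consecutive elements of the same array (coalesced)
struct ParticlesSoA { float posx[4096], posy[4096], velx[4096], vely[4096], mass[4096]; };

// Vector-load shortcut used below: pack pos x/y and vel x/y into one float4 so a single
// 16-byte load fetches all four values per particle; mass lives in its own float array
struct ParticleVec { float4 p; };   // p.x = posx, p.y = posy, p.z = velx, p.w = vely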
4096 is a relatively small number of threads for a modern GPU. You may see a small benefit by switching from 1024-thread blocks to 512-thread blocks. This gives the kernel a better chance of filling the available SMs on the GPU. If you had only 4 or fewer SMs it wouldn't make much difference, but your 980 Ti has 22 SMs, so our best chance of witnessing maximum performance is to put at least one block on each SM. So you may even want to try blocks of 256 threads (16 blocks in total).
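In launch-configuration terms that is just the following (using the buffer names from the full listing below, and assuming, as here, that N is a multiple of the block size):
const int threadsPerBlock = 256;            // was 1024; 512 or 256 spreads the work over more blocks
const int numBlocks = N / threadsPerBlock;  // 16 blocks at 256 threads, vs. 4 blocks at 1024
updateParticle<<<numBlocks, threadsPerBlock>>>(d_po, d_pmasso, d_pi, d_pmassi);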
This is a fairly "expensive" set of computations:
len = sqrt(len);
double dirx = -difx / len;
double diry = -dify / len;
It turns out that rsqrtf() is about as cheap to compute as sqrtf(), and it lets us convert the subsequent floating-point division operations into floating-point multiplications.
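The rewritten form (as it appears in the kernel below) trades the square root and two divisions for a reciprocal square root and two multiplications:
len = rsqrtf(len);          // reciprocal square root: 1/sqrt(len)
float dirx = -difx * len;   // multiply instead of divide
float diry = -dify * len;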
With these basic steps I can reach about 30 fps on a very old GPU, so you should see something better than that. I'm working on linux, but I don't believe any of the changes I made should "break" anything under windows.
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include <ctime>
#include <string>
#include <cstdlib>
#include <cstdio>
#include <time.h>
#define N 4096
#define DT 0.00001
#define M_PI 3.14159265358979323846 /* pi */
class Particle
{
public:
Particle();
float4 p;
};
Particle::Particle()
{
p.x = (((double)rand() / (RAND_MAX)) * 2) - 1;
p.y = (((double)rand() / (RAND_MAX)) * 2) - 1;
p.z = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
p.w = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
}
const int size = N * sizeof(Particle);
using namespace std;
Particle p[N];
float pmass[N];
Particle *d_p1, *d_p2;
float *d_pmass1, *d_pmass2;
int ping_pong = 0;
float et;
cudaEvent_t start, stop;
int frames = 0;
clock_t starttime = clock();
clock_t timepassed = 0;
bool first = true;
float fps = 0.0f;
__global__ void updateParticle(Particle * __restrict__ out, float * __restrict__ pmass_out, const Particle * __restrict__ pin, const float * __restrict__ pmass_in)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
float velx = 0;
float vely = 0;
Particle my_i = pin[i];
float my_mass_i = pmass_in[i];
pmass_out[i] = my_mass_i;
for(int j = 0; j < N; j++)
{
float my_mass_j = pmass_in[j];
if (i == j || my_mass_i == 0 || my_mass_j == 0)
continue;
Particle my_j = pin[j];
float difx = my_i.p.x - my_j.p.x;
float dify = my_i.p.y - my_j.p.y;
float len = difx * difx + dify * dify;
if (len == 0)
continue;
float force = (my_mass_i * my_mass_j) / len;
len = rsqrtf(len);
float dirx = -difx * len;
float diry = -dify * len;
dirx *= force;
diry *= force;
velx += (dirx / my_mass_i + my_i.p.z) * DT;
vely += (diry / my_mass_i + my_i.p.w) * DT;
}
Particle my_out_i = my_i;
my_out_i.p.x = my_i.p.x + velx;
my_out_i.p.y = my_i.p.y + vely;
my_out_i.p.z = my_i.p.z;
my_out_i.p.w = my_i.p.w;
if (my_out_i.p.x > 1)
my_out_i.p.x = 1;
if (my_out_i.p.x < -1)
my_out_i.p.x = -1;
if (my_out_i.p.y > 1)
my_out_i.p.y = 1;
if (my_out_i.p.y < -1)
my_out_i.p.y = -1;
out[i] = my_out_i;
}
void changeViewPort(int w, int h)
{
glViewport(0, 0, w, h);
}
void renderMore()
{
for (int i = 0; i < N; ++i)
{
if (pmass[i] == 0)
continue;
if (pmass[i] == 1)
glColor3f(1, 1, 1);
else
glColor3f(1, 0, 0);
glBegin(GL_LINE_LOOP);
for (int j = 0; j <= 4; j++) {
double angle = 2 * M_PI * j / 300;
double x = cos(angle) * 0.001;
double y = sin(angle) * 0.001;
x *= pmass[i];
y *= pmass[i];
glVertex2d(x + p[i].p.x, y + p[i].p.y);
}
glEnd();
}
}
void render(void)
{
if(first)
{
frames = 0;
starttime = clock();
first = false;
}
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
renderMore();
glutSwapBuffers();
frames++;
}
void moveCuda(Particle* in, Particle* out)
{
Particle *d_pi;
Particle *d_po;
float *d_pmassi, *d_pmasso;
cudaError_t cudaStatus;
if (ping_pong) {
d_pi = d_p2;
d_po = d_p1;
d_pmassi = d_pmass2;
d_pmasso = d_pmass1;
ping_pong = 0;}
else {
d_pi = d_p1;
d_po = d_p2;
d_pmassi = d_pmass1;
d_pmasso = d_pmass2;
ping_pong = 1;}
cudaEventRecord(start);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
updateParticle<<<N / 256, 256>>>(d_po, d_pmasso, d_pi, d_pmassi);
cudaStatus = cudaGetLastError(); // pick up any kernel launch error before checking it
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaEventRecord(stop);
cudaStatus = cudaMemcpy(out, d_po, size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
//cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
}
void update(int)
{
Particle temp[N] = {};
moveCuda(p, temp);
for (int i = 0; i < N; ++i)
p[i] = temp[i];
char a[64];
fps = (float)frames / ((float)(clock() - starttime) / CLOCKS_PER_SEC);
sprintf(a, "FPS: %f, et: %f\0", fps, et);
glutSetWindowTitle(a);
glutTimerFunc(100.0 / 60, update, -1);
}
void idle()
{
glutPostRedisplay();
}
int main(int argc, char* argv[])
{
for (int i = 0; i < N; ++i)
{
p[i] = Particle();
pmass[i] = 1;
// p[i].p();
}
cudaMalloc((void**)&d_p2, size);
cudaMalloc((void**)&d_p1, size);
cudaMalloc((void**)&d_pmass2, N*sizeof(float));
cudaMalloc((void**)&d_pmass1, N*sizeof(float));
cudaMemcpy(d_p1, p, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_pmass1, pmass, N*sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start); cudaEventCreate(&stop);
// Initialize GLUT
glutInit(&argc, argv);
// Set up some memory buffers for our display
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
// Set the window size
glutInitWindowSize(1000, 1000);
// Create the window with the title "Hello,GL"
glutCreateWindow("Hello World");
// Bind the two functions (above) to respond when necessary
glutReshapeFunc(changeViewPort);
glutDisplayFunc(render);
glutTimerFunc(100.0 / 60, update, -1);
glutIdleFunc(idle);
// Very important! This initializes the entry points in the OpenGL driver so we can
// call all the functions in the API.
GLenum err = glewInit();
if (GLEW_OK != err) {
fprintf(stderr, "GLEW error");
return 1;
}
render();
glutMainLoop();
return 0;
}
I'm not suggesting this is defect-free code (and I don't think yours is, either), but it seems to behave graphically the same as the original code. For example, in your code you have this at the end of the kernel:
out[i].velx = pin[i].velx;
out[i].vely = pin[i].vely;
That doesn't look right to me, but it isn't the core of the performance discussion here.
If you know that your masses are always either 1 or 0, you could make a lot of additional optimizations to this code, but I haven't pursued that.
As a side note, you might want to consider a CUDA/OpenGL interop strategy to get rid of the device -> host copy that remains here and move the data permanently to the GPU. Again, the CUDA nbody sample code can serve as a roadmap; and if you want to get started with CUDA/GL interop, I think this presentation, while a bit dated, is a good starting point.
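As a rough, untested sketch of what that interop flow can look like (the names vbo, vbo_res, initInterop and stepInterop are made up for illustration; error checking omitted; assumes the GLEW headers and cuda_gl_interop.h are available and N is #defined as above):
#include <cuda_gl_interop.h>

GLuint vbo;                              // OpenGL buffer that will hold the particle data
cudaGraphicsResource *vbo_res = nullptr; // CUDA's handle to the same buffer

void initInterop()
{
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, N * sizeof(float4), nullptr, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    cudaGraphicsGLRegisterBuffer(&vbo_res, vbo, cudaGraphicsMapFlagsWriteDiscard);
}

void stepInterop()
{
    float4 *d_pos = nullptr;
    size_t bytes = 0;
    cudaGraphicsMapResources(1, &vbo_res, 0);                        // hand the buffer to CUDA
    cudaGraphicsResourceGetMappedPointer((void**)&d_pos, &bytes, vbo_res);
    // ... launch the update kernel so it writes positions directly into d_pos ...
    cudaGraphicsUnmapResources(1, &vbo_res, 0);                      // give it back to OpenGL
    // OpenGL can now source vertices straight from vbo (e.g. glVertexPointer + glDrawArrays),
    // with no device -> host copy at all
}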