I've been experimenting with CUDA/C++ and decided to make an n-body simulator. It simulates the gravitational attraction between 4096 particles. It runs at around 2 or 3 FPS, and I'm not entirely sure why. The graphics card in use is a GTX 980 Ti, so I would expect the program to run smoothly. I know it probably isn't optimized as well as it could be, but I wouldn't expect it to run this slowly.
The code is only meant to be a prototype, so it is not tidy (or properly written) in any way.
main.cu
#include <Windows.h>
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include "Particle.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <ctime>
#include <string>
#define N 4096
#define DT 0.00001
# define M_PI 3.14159265358979323846 /* pi */
using namespace std;
Particle p[N];
int frames = 0;
clock_t starttime = clock();
clock_t timepassed = 0;
bool first = true;
float fps = 0.0f;
__global__ void updateParticle(Particle* out, Particle *pin)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
double velx = 0;
double vely = 0;
out[i].mass = pin[i].mass;
for(int j = 0; j < N; j++)
{
if (i == j || pin[j].mass == 0 || pin[i].mass == 0)
continue;
double difx = pin[i].posx - pin[j].posx;
double dify = pin[i].posy - pin[j].posy;
double len = difx * difx + dify * dify;
if (len == 0)
continue;
double force = (pin[i].mass * pin[j].mass) / len;
len = sqrt(len);
double dirx = -difx / len;
double diry = -dify / len;
dirx *= force;
diry *= force;
velx += (dirx / pin[i].mass + pin[i].velx) * DT;
vely += (diry / pin[i].mass + pin[i].vely) * DT;
}
out[i].posx = pin[i].posx + velx;
out[i].posy = pin[i].posy + vely;
out[i].velx = pin[i].velx;
out[i].vely = pin[i].vely;
while (out[i].posx > 1)
out[i].posx--;
while (out[i].posx < -1)
out[i].posx++;
while (out[i].posy > 1)
out[i].posy--;
while (out[i].posy < -1)
out[i].posy++;
}
void changeViewPort(int w, int h)
{
glViewport(0, 0, w, h);
}
void renderMore()
{
for (int i = 0; i < N; ++i)
{
if (p[i].mass == 0)
continue;
if (p[i].mass == 1)
glColor3f(1, 1, 1);
else
glColor3f(1, 0, 0);
glBegin(GL_LINE_LOOP);
for (int j = 0; j <= 4; j++) {
double angle = 2 * M_PI * j / 300;
double x = cos(angle) * 0.001;
double y = sin(angle) * 0.001;
x *= p[i].mass;
y *= p[i].mass;
glVertex2d(x + p[i].posx, y + p[i].posy);
}
glEnd();
}
}
void render(void)
{
if(first)
{
frames = 0;
starttime = clock();
first = false;
}
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
renderMore();
glutSwapBuffers();
frames++;
}
void moveCuda(Particle* in, Particle* out)
{
Particle *device_p = nullptr;
Particle *device_res = nullptr;
cudaError_t cudaStatus;
int size = N * sizeof(Particle);
cudaStatus = cudaMalloc((void**)&device_res, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMalloc((void**)&device_p, size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(device_p, in, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
updateParticle<<<N / 1024, 1024>>>(device_res, device_p);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaStatus = cudaMemcpy(out, device_res, size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaFree(device_res);
cudaFree(device_p);
}
void update(int)
{
Particle temp[N] = {};
moveCuda(p, temp);
for (int i = 0; i < N; ++i)
p[i] = temp[i];
fps = (double)frames / ((clock() - starttime) / 1000);
const string a = "FPS: " + to_string(fps);
glutSetWindowTitle(a.c_str());
glutTimerFunc(100.0 / 60, update, -1);
}
void idle()
{
glutPostRedisplay();
}
int main(int argc, char* argv[])
{
for (int i = 0; i < N; ++i)
{
p[i] = Particle();
}
// Initialize GLUT
glutInit(&argc, argv);
// Set up some memory buffers for our display
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
// Set the window size
glutInitWindowSize(1000, 1000);
// Create the window with the title "Hello,GL"
glutCreateWindow("Hello World");
// Bind the two functions (above) to respond when necessary
glutReshapeFunc(changeViewPort);
glutDisplayFunc(render);
glutTimerFunc(100.0 / 60, update, -1);
glutIdleFunc(idle);
// Very important! This initializes the entry points in the OpenGL driver so we can
// call all the functions in the API.
GLenum err = glewInit();
if (GLEW_OK != err) {
fprintf(stderr, "GLEW error");
return 1;
}
render();
glutMainLoop();
return 0;
}
Particle.cpp
#include "Particle.h"
#include "stdlib.h"
#include <host_defines.h>
Particle::Particle()
{
posx = (((double)rand() / (RAND_MAX)) * 2) - 1;
posy = (((double)rand() / (RAND_MAX)) * 2) - 1;
velx = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
vely = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
mass = 1;
}
Particle.h
#pragma once
class Particle
{
public:
Particle();
void Update();
double posx;
double posy;
double velx;
double vely;
double mass;
};
When I remove the line that sets the graphics device, it throws an error but keeps running at 2-3 fps. This might indicate that it's having trouble picking up my graphics card, but I don't know what to do about it. When I set it to cudaSetDevice(0) it doesn't throw an error. The graphics card works fine, and the monitor is connected and working.
If anyone could provide some pointers or advice, I would greatly appreciate it.
Answer (score: 2)
First of all, you may want to study the CUDA nbody sample code, since it will demonstrate well-written code better than I will. Also note that that sample contains a link to this chapter, which is also instructive.
I'll provide a version of the code that seems to run much faster than the original. Here are the general strategies I applied:
Don't perform unnecessary cudaMalloc / cudaFree or cudaMemcpy operations. It's better to do those allocations once and reuse them. Since you aren't modifying anything (positions, velocities) in host code, this also means we don't really need to update the device on every iteration of moveCuda; just leave the data on the device. That gets us down to a single cudaMemcpy operation per frame, so that we can do the OpenGL work (but see below). I seem to get about a 3x boost from this. I also implemented a "ping-pong" buffering strategy to avoid unnecessary copying.
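As a rough sketch of the allocate-once, ping-pong idea (the names d_a, d_b, initBuffers and step below are made up for illustration, and the kernel signature is simplified to (out, in)):
Particle *d_a = nullptr, *d_b = nullptr;  // allocated once at startup, reused every frame
int ping_pong = 0;

void initBuffers(const Particle *host_p)
{
    cudaMalloc((void**)&d_a, N * sizeof(Particle));
    cudaMalloc((void**)&d_b, N * sizeof(Particle));
    // one upload; after this the particle data lives on the device
    cudaMemcpy(d_a, host_p, N * sizeof(Particle), cudaMemcpyHostToDevice);
}

void step(Particle *host_out)
{
    Particle *in  = ping_pong ? d_b : d_a;   // read the buffer written last frame
    Particle *out = ping_pong ? d_a : d_b;   // write into the other one
    ping_pong = !ping_pong;
    updateParticle<<<N / 256, 256>>>(out, in);
    // the only per-frame copy: bring results back so OpenGL can draw them
    cudaMemcpy(host_out, out, N * sizeof(Particle), cudaMemcpyDeviceToHost);
}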
Use float instead of double. This has several benefits. First, it reduces memory traffic, since you're fetching half as much data. Second, the GPU you're using has much higher throughput (math operations) for float than for double. I don't really think this is a compute-bound kernel, so I think memory traffic is the bigger issue. I seem to get another roughly 3x boost from this.
Convert the particles from AoS to SoA. This topic is covered on the cuda tag as well as in many other places, so I won't go over it here. I didn't do it completely; instead I did a partial conversion (moving the mass into a separate array) and then used a "vector load" strategy with float4 for the remaining velocity x/y and position x/y quantities. Here is an example answer that discusses the AoS -> SoA conversion and why it is valuable, as well as the vector-load "shortcut" I used here.
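To illustrate the AoS -> SoA idea and the float4 vector-load shortcut (the struct and field names here are illustrative only):
#include <vector_types.h>   // float4

// AoS: a thread reading posx for particle i touches a location strided by sizeof(Particle),
// so a warp's reads are scattered and memory bandwidth is wasted
struct ParticleAoS { double posx, posy, velx, vely, mass; };

// SoA: consecutive threads read consecutive elements of the same array (coalesced)
struct ParticlesSoA { float posx[4096], posy[4096], velx[4096], vely[4096], mass[4096]; };

// Vector-load shortcut used below: pack pos x/y and vel x/y into one float4 so a single
// 16-byte load fetches all four values per particle; mass lives in its own float array
struct ParticleVec { float4 p; };   // p.x = posx, p.y = posy, p.z = velx, p.w = vely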
4096 is a relatively small number of threads for a modern GPU. You may see a small benefit by switching from 1024-thread blocks to 512-thread blocks. This gives the kernel a better chance of filling the available SMs on the GPU. If you had only 4 or fewer SMs it wouldn't make much difference, but your 980 Ti has 22 SMs, so our best chance of witnessing maximum performance is to put at least one block on each SM. So you may even want to try blocks of 256 threads (16 blocks in total).
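In launch-configuration terms that is just the following (using the buffer names from the full listing below, and assuming, as here, that N is a multiple of the block size):
const int threadsPerBlock = 256;            // was 1024; 512 or 256 spreads the work over more blocks
const int numBlocks = N / threadsPerBlock;  // 16 blocks at 256 threads, vs. 4 blocks at 1024
updateParticle<<<numBlocks, threadsPerBlock>>>(d_po, d_pmasso, d_pi, d_pmassi);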
This is a fairly "expensive" set of computations:
len = sqrt(len);
double dirx = -difx / len;
double diry = -dify / len;
It turns out that rsqrtf() is about as cheap to compute as sqrtf(), and it lets us convert the subsequent floating-point division operations into floating-point multiplications.
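The rewritten form (as it appears in the kernel below) trades the square root and two divisions for a reciprocal square root and two multiplications:
len = rsqrtf(len);          // reciprocal square root: 1/sqrt(len)
float dirx = -difx * len;   // multiply instead of divide
float diry = -dify * len;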
With these basic steps I can reach about 30 fps on a very old GPU, so you should see something better than that. I'm working on linux, but I don't believe any of the changes I made should "break" anything under windows.
#include <GL/glew.h>
#include <GL/freeglut.h>
#include <iostream>
#include <vector>
#include <math.h>
#include <ctime>
#include <string>
#include <cstdlib>
#include <cstdio>
#include <time.h>
#define N 4096
#define DT 0.00001
#define M_PI 3.14159265358979323846 /* pi */
class Particle
{
public:
Particle();
float4 p;
};
Particle::Particle()
{
p.x = (((double)rand() / (RAND_MAX)) * 2) - 1;
p.y = (((double)rand() / (RAND_MAX)) * 2) - 1;
p.z = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
p.w = ((((double)rand() / (RAND_MAX)) * 2) - 1) / 4;
}
const int size = N * sizeof(Particle);
using namespace std;
Particle p[N];
float pmass[N];
Particle *d_p1, *d_p2;
float *d_pmass1, *d_pmass2;
int ping_pong = 0;
float et;
cudaEvent_t start, stop;
int frames = 0;
clock_t starttime = clock();
clock_t timepassed = 0;
bool first = true;
float fps = 0.0f;
__global__ void updateParticle(Particle * __restrict__ out, float * __restrict__ pmass_out, const Particle * __restrict__ pin, const float * __restrict__ pmass_in)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
float velx = 0;
float vely = 0;
Particle my_i = pin[i];
float my_mass_i = pmass_in[i];
pmass_out[i] = my_mass_i;
for(int j = 0; j < N; j++)
{
float my_mass_j = pmass_in[j];
if (i == j || my_mass_i == 0 || my_mass_j == 0)
continue;
Particle my_j = pin[j];
float difx = my_i.p.x - my_j.p.x;
float dify = my_i.p.y - my_j.p.y;
float len = difx * difx + dify * dify;
if (len == 0)
continue;
float force = (my_mass_i * my_mass_j) / len;
len = rsqrtf(len);
float dirx = -difx * len;
float diry = -dify * len;
dirx *= force;
diry *= force;
velx += (dirx / my_mass_i + my_i.p.z) * DT;
vely += (diry / my_mass_i + my_i.p.w) * DT;
}
Particle my_out_i = my_i;
my_out_i.p.x = my_i.p.x + velx;
my_out_i.p.y = my_i.p.y + vely;
my_out_i.p.z = my_i.p.z;
my_out_i.p.w = my_i.p.w;
if (my_out_i.p.x > 1)
my_out_i.p.x = 1;
if (my_out_i.p.x < -1)
my_out_i.p.x = -1;
if (my_out_i.p.y > 1)
my_out_i.p.y = 1;
if (my_out_i.p.y < -1)
my_out_i.p.y = -1;
out[i] = my_out_i;
}
void changeViewPort(int w, int h)
{
glViewport(0, 0, w, h);
}
void renderMore()
{
for (int i = 0; i < N; ++i)
{
if (pmass[i] == 0)
continue;
if (pmass[i] == 1)
glColor3f(1, 1, 1);
else
glColor3f(1, 0, 0);
glBegin(GL_LINE_LOOP);
for (int j = 0; j <= 4; j++) {
double angle = 2 * M_PI * j / 300;
double x = cos(angle) * 0.001;
double y = sin(angle) * 0.001;
x *= pmass[i];
y *= pmass[i];
glVertex2d(x + p[i].p.x, y + p[i].p.y);
}
glEnd();
}
}
void render(void)
{
if(first)
{
frames = 0;
starttime = clock();
first = false;
}
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
renderMore();
glutSwapBuffers();
frames++;
}
void moveCuda(Particle* in, Particle* out)
{
Particle *d_pi;
Particle *d_po;
float *d_pmassi, *d_pmasso;
cudaError_t cudaStatus;
if (ping_pong) {
d_pi = d_p2;
d_po = d_p1;
d_pmassi = d_pmass2;
d_pmasso = d_pmass1;
ping_pong = 0;}
else {
d_pi = d_p1;
d_po = d_p2;
d_pmassi = d_pmass1;
d_pmasso = d_pmass2;
ping_pong = 1;}
cudaEventRecord(start);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
updateParticle<<<N / 256, 256>>>(d_po, d_pmasso, d_pi, d_pmassi);
cudaStatus = cudaGetLastError(); // pick up any kernel launch error before checking it
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaEventRecord(stop);
cudaStatus = cudaMemcpy(out, d_po, size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
//cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&et, start, stop);
}
void update(int)
{
Particle temp[N] = {};
moveCuda(p, temp);
for (int i = 0; i < N; ++i)
p[i] = temp[i];
char a[64];
fps = (float)frames / ((float)(clock() - starttime) / CLOCKS_PER_SEC);
sprintf(a, "FPS: %f, et: %f\0", fps, et);
glutSetWindowTitle(a);
glutTimerFunc(100.0 / 60, update, -1);
}
void idle()
{
glutPostRedisplay();
}
int main(int argc, char* argv[])
{
for (int i = 0; i < N; ++i)
{
p[i] = Particle();
pmass[i] = 1;
// p[i].p();
}
cudaMalloc((void**)&d_p2, size);
cudaMalloc((void**)&d_p1, size);
cudaMalloc((void**)&d_pmass2, N*sizeof(float));
cudaMalloc((void**)&d_pmass1, N*sizeof(float));
cudaMemcpy(d_p1, p, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_pmass1, pmass, N*sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start); cudaEventCreate(&stop);
// Initialize GLUT
glutInit(&argc, argv);
// Set up some memory buffers for our display
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
// Set the window size
glutInitWindowSize(1000, 1000);
// Create the window with the title "Hello,GL"
glutCreateWindow("Hello World");
// Bind the two functions (above) to respond when necessary
glutReshapeFunc(changeViewPort);
glutDisplayFunc(render);
glutTimerFunc(100.0 / 60, update, -1);
glutIdleFunc(idle);
// Very important! This initializes the entry points in the OpenGL driver so we can
// call all the functions in the API.
GLenum err = glewInit();
if (GLEW_OK != err) {
fprintf(stderr, "GLEW error");
return 1;
}
render();
glutMainLoop();
return 0;
}
I'm not suggesting this is defect-free code (and I don't think yours is, either), but it seems to behave graphically the same as the original code. For example, in your code you have this at the end of the kernel:
out[i].velx = pin[i].velx;
out[i].vely = pin[i].vely;
That doesn't look right to me, but it isn't the core of the performance discussion here.
If you know that your masses are always either 1 or 0, you could make a lot of additional optimizations to this code, but I haven't pursued that.
As a side note, you might want to consider a CUDA/OpenGL interop strategy to get rid of the device -> host copy that remains here and move the data permanently to the GPU. Again, the CUDA nbody sample code can serve as a roadmap; and if you want to get started with CUDA/GL interop, I think this presentation, while a bit dated, is a good starting point.
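As a rough, untested sketch of what that interop flow can look like (the names vbo, vbo_res, initInterop and stepInterop are made up for illustration; error checking omitted; assumes the GLEW headers and cuda_gl_interop.h are available and N is #defined as above):
#include <cuda_gl_interop.h>

GLuint vbo;                              // OpenGL buffer that will hold the particle data
cudaGraphicsResource *vbo_res = nullptr; // CUDA's handle to the same buffer

void initInterop()
{
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, N * sizeof(float4), nullptr, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);
    cudaGraphicsGLRegisterBuffer(&vbo_res, vbo, cudaGraphicsMapFlagsWriteDiscard);
}

void stepInterop()
{
    float4 *d_pos = nullptr;
    size_t bytes = 0;
    cudaGraphicsMapResources(1, &vbo_res, 0);                        // hand the buffer to CUDA
    cudaGraphicsResourceGetMappedPointer((void**)&d_pos, &bytes, vbo_res);
    // ... launch the update kernel so it writes positions directly into d_pos ...
    cudaGraphicsUnmapResources(1, &vbo_res, 0);                      // give it back to OpenGL
    // OpenGL can now source vertices straight from vbo (e.g. glVertexPointer + glDrawArrays),
    // with no device -> host copy at all
}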