我有一个小游戏,想把它移植到 Android。PC 版本最初使用(现在仍然可以使用)OpenCL 进行碰撞检测,其速度比 CPU 实现快 230 倍(!)。我想改用计算着色器(Compute Shader),把并行计算迁移到 OpenGL,从而去掉 OpenCL,让所有内容都留在 OpenGL 之内。我的第一个 OpenGL 计算着色器与 OpenCL 版本功能相同,但它比 OpenCL 版本慢 14 倍!
我已经在这里读过一些关于 OpenGL 计算着色器性能的帖子,但它们讨论的通常是纹理访问。我没有使用纹理,只有两个 SSBO:一个用于输入,一个用于输出。
任务是一个 N 体(n-body)问题,代码采用暴力方式,让每个点与所有其他点逐一比较。程序中可以有数千个点(每次迭代都可能添加或删除任意数量的点)。我希望能够同时处理尽可能多的点。
2000 个点时的耗时是:
算法基本上是:
// Brute-force pairwise pass (pseudocode): every point is tested against every point.
// Outer loop: p1 is the point that receives the accumulated force.
for (Point * p1: points) {
// Inner loop: p2 is each candidate p1 may be colliding with.
for (Point * p2: points) {
// perform collision detection between p1 and p2
// calculate and accumulate any repulsive force on p1
}
}
并行化的目的是把外层循环展平。这是一个朴素的实现,但目前对我来说已经够用了——既然它在 OpenCL 上足以获得 230 倍的加速,那么 OpenGL 版本也不应该落后太多,对吧?
C++ 代码:
void Render::compute (int num_stuff, PointCSData * data, PointCSData_Out *& data_out) {
// data IN
glBindBuffer (GL_SHADER_STORAGE_BUFFER, csbo);
glBufferData (GL_SHADER_STORAGE_BUFFER, num_stuff * sizeof(PointCSData), data, GL_DYNAMIC_COPY);
glUnmapBuffer (GL_SHADER_STORAGE_BUFFER);
// data OUT - prep space
glBindBuffer (GL_SHADER_STORAGE_BUFFER, csbo_out);
glBufferData (GL_SHADER_STORAGE_BUFFER, num_stuff * sizeof(PointCSData_Out), NULL, GL_DYNAMIC_COPY);
glUnmapBuffer (GL_SHADER_STORAGE_BUFFER);
glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0, csbo);
glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 1, csbo_out);
glUseProgram (computeProgram);
glDispatchCompute (num_stuff / 32, 32, 1);
glMemoryBarrier (GL_ALL_BARRIER_BITS);
glBindBuffer (GL_SHADER_STORAGE_BUFFER, csbo_out);
GLuint bufMask = GL_MAP_READ_BIT;
data_out = (PointCSData_Out *) glMapBufferRange (GL_SHADER_STORAGE_BUFFER, 0, num_stuff * sizeof(PointCSData_Out), bufMask);
glUnmapBuffer (GL_SHADER_STORAGE_BUFFER);
glUseProgram (0);
}
着色器:
#version 430
// One input record per point, read from the cs_data storage buffer.
// Used inside a std430 block: the members pack to 32 bytes, with `vel` at
// offset 24 (vec2 is 8-byte aligned).
// NOTE(review): presumably mirrors the host-side PointCSData; confirm the
// C++ struct has the same member order and size.
struct PointData {
int is_fixed; // 1 = fixed point; main() accumulates no force for it
float posx, posy; // point position
int thingpntr; // id of the owning "thing"; -1 marks a padding point, and points sharing an id are not tested against each other
float friction; // not used by this shader
float rigidity; // weight in main(): F * rigidity goes to accrig, F * (1 - rigidity) to acc
vec2 vel; // not used by this shader
};
// One output record per point, written to the cs_data_out storage buffer.
// Two vec2 members: 16 bytes per record under std430.
struct PointData_Out {
vec2 acc; // sum over colliding points of F * (1 - rigidity)
vec2 accrig; // sum over colliding points of F * rigidity
};
// Input points: storage buffer at binding point 0 (the host binds csbo there).
layout (std430, binding=0) buffer cs_data {
PointData data [ ];
};
// Output records: storage buffer at binding point 1 (the host binds csbo_out there).
layout (std430, binding=1) buffer cs_data_out {
PointData_Out data_out [ ];
};
// 32 invocations per work group, laid out along x only.
layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
// Brute-force collision pass: the invocation with global x index `gid`
// accumulates the spring force exerted on point `gid` by every other point
// closer than the collision distance, and writes it to data_out[gid].
// Fixed points and padding points get a zero result.
void main () {
    const float k = 500.0; // Hookes_K
    const float d = 8.0;   // collision_d

    // Number of points = number of invocations along x. Built from the x
    // dimension only, so it does not depend on how many groups are dispatched
    // in y; with a (count/32, 32, 1) dispatch this equals the former
    // gl_NumWorkGroups.x * gl_NumWorkGroups.y.
    uint n = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
    uint gid = gl_GlobalInvocationID.x;

    // Accumulators must start at zero: uninitialised locals are undefined.
    vec2 acc = vec2 (0.0);
    vec2 accrig = vec2 (0.0);

    // Read this point's fields once instead of on every loop iteration.
    int my_thing = data [gid].thingpntr;

    if (my_thing != -1                  // padding point: nothing to compute
        && data [gid].is_fixed != 1) {  // fixed point: nothing to compute
        float rig = data [gid].rigidity;
        vec2 p1_pos = vec2 (data [gid].posx, data [gid].posy);

        for (uint i = 0u; i < n; i++) {
            int other_thing = data [i].thingpntr;
            if (i == gid                     // skip if self
                || other_thing == -1         // skip if padding point
                || other_thing == my_thing)  // skip if of same thing
                continue;

            vec2 dif = vec2 (data [i].posx, data [i].posy) - p1_pos;
            float h = length (dif);
            // Out of range, or coincident (would divide by zero below).
            if (h > d || h == 0.0) continue;

            // Hooke's law along the unit vector towards the other point;
            // (h - d) is negative here, so the force pushes the points apart.
            vec2 F = (dif / h) * (k * (h - d));
            acc += F * (1.0 - rig);
            accrig += F * rig;
        }
    }

    // Always write the result: the host allocates the output buffer without
    // initial data, so skipped points would otherwise read back garbage.
    data_out [gid].acc = acc;
    data_out [gid].accrig = accrig;
}
调用 glBufferData() 和 glMapBufferRange() 时,我传入的用途(usage)和访问位域(access bitfield)参数是否正确?
另外,是什么原因导致 OpenGL 计算着色器版本的运行速度比功能相同的 OpenCL 版本慢这么多?