Question

我正在使用 OpenGL ES，有两种计算 "dir" 向量的写法，哪一种代码更快？

attribute vec2 order;

代码1：

  // Branching variant: when |sinA| is below 0.2, replace dir with one of the
  // two normals; otherwise scale dir by order.x / sinA.
  if( abs(sinA) < 0.2 ) {
    // Exact float comparison: only order.x == 1.0 selects sNormalPrev,
    // every other value of order.x selects sNormalNext.
    if(order.x == 1.0){
        dir = sNormalPrev;   
    } else {
        dir = sNormalNext;   
    }
  } else {
    // This branch is only reached when |sinA| >= 0.2, so sinA is non-zero here.
    dir *= order.x / sinA;
  }

代码2：

// Branchless variant: step(edge, x) yields 0.0 when x < edge and 1.0 otherwise,
// so k is 0.0 when abs(sinA) < 0.2 and 1.0 when abs(sinA) >= 0.2.
float k = step(0.2, abs(sinA));
// With k == 1.0 the factor -(k-1.0) is 0.0 and only dir * order.x / sinA remains.
// With k == 0.0 the factor -(k-1.0) is 1.0 and the step()-weighted sum of the normals is selected.
// NOTE(review): step(1.0, order.x + 1.0) is 1.0 for every order.x >= 0.0 and
// step(1.0, -order.x + 1.0) is 1.0 for every order.x <= 0.0, so this agrees with the
// if/else form for order.x == 1.0 and order.x == -1.0 but not for other values
// (order.x == 0.0 adds both normals) - confirm order.x is always +1.0 or -1.0.
// NOTE(review): the first term is still divided by sinA when k == 0.0; with sinA == 0.0
// that is a division by zero whose result may be NaN/undefined - verify on target GPUs.
dir = k * dir * order.x / sinA - (k-1.0) * (step(1.0, order.x + 1.0) * sNormalPrev + step(1.0, -order.x + 1.0) * sNormalNext);

Answer 1

我写了一个测试，但看不出两者有太大差异。

// Number of draw calls issued for each timing sample (passed to test()).
var iterationsPerTiming = 40;

// 1x1 WebGL canvas that is never added to the page; it only serves as a
// render target for the vertex-shader benchmark.
var gl = document.createElement("canvas").getContext("webgl");
gl.canvas.width = 1;
gl.canvas.height = 1;
// One program per vertex-shader variant; both use the same fragment shader.
var programInfo1 = twgl.createProgramInfo(gl, ["vs1", "fs"]);
var programInfo2 = twgl.createProgramInfo(gl, ["vs2", "fs"]);

// One million per-vertex values cycling through 0, 0.5 and 1.
var count = new Float32Array(1000000);
for (var i = 0; i < count.length; ++i) {
  count[i] = i % 3 / 2;
}

// Single float attribute named "vertexId", matching the attribute declared
// in both vertex shaders.
var arrays = {
  vertexId: {
    data: count, numComponents: 1,
  },
};
var bufferInfo = twgl.createBufferInfoFromArrays(gl, arrays);

// Run both programs once without reporting (warm-up), then measure and print
// each program in turn. The runs are chained so they never overlap.
iterateTest(programInfo1, 10)  // prime this path
  .then(function() { return iterateTest(programInfo2, 10); })  // prime this path
  .then(function() { return iterateTest(programInfo1, 20); })
  .then(log)
  .then(function() { return iterateTest(programInfo2, 20); })
  .then(log)
  .catch(function(err) {
    // Without a rejection handler a failure (for example a program that did
    // not compile) would leave the page silently empty.
    return log("error: " + err);
  });

/**
 * Collects `times` timing samples by calling test() repeatedly, yielding to
 * the event loop between samples, and formats them as a text report.
 *
 * @param {Object} programInfo - Passed through unchanged to test().
 * @param {number} times - Number of samples to collect. Values that are not
 *     greater than zero (0, negative, NaN) collect no samples.
 * @returns {Promise<string>} Resolves with one "index: value" line per sample
 *     (value formatted with three decimals) followed by an
 *     "average timing: ..." line. With no samples the average is "NaN".
 */
function iterateTest(programInfo, times) {
  return new Promise(function(resolve) {
    var timings = [];
    var remaining = times;

    function runNextIteration() {
      // Compare against zero instead of testing truthiness: a negative or
      // fractional count would otherwise never become falsy and the loop
      // would reschedule itself forever.
      if (remaining > 0) {
        --remaining;
        timings.push(test(programInfo, iterationsPerTiming));
        // Schedule the next sample asynchronously rather than looping, so the
        // browser gets control back between samples.
        setTimeout(runNextIteration, 1);
        return;
      }

      var totalTime = 0;
      var msgs = timings.map(function(timing, ndx) {
        totalTime += timing;
        return "" + ndx + ": " + timing.toFixed(3);
      });
      msgs.push("average timing: " + (totalTime / timings.length).toFixed(3));
      resolve(msgs.join("\n"));
    }

    runNextIteration();
  });
}

/**
 * Times `iterations` draw calls with the given program.
 *
 * @param {Object} programInfo - Program info whose `program` is made current
 *     and whose attributes are bound to the shared bufferInfo.
 * @param {number} iterations - Number of draw calls to issue.
 * @returns {number} Difference between two performance.now() readings taken
 *     before the first draw and after the pixel read-back.
 */
function test(programInfo, iterations) {
  gl.useProgram(programInfo.program);
  twgl.setBuffersAndAttributes(gl, programInfo, bufferInfo);

  var began = performance.now();

  var drawn = 0;
  while (drawn < iterations) {
    twgl.drawBufferInfo(gl, gl.TRIANGLES, bufferInfo, count.length);
    drawn += 1;
  }

  // Reading a pixel back effectively acts as a gl.finish. That makes this
  // useless for absolute timings because it stalls the pipeline, but the
  // stall is part of both measurements, so the two programs can still be
  // compared against each other.
  var pixel = new Uint8Array(4);
  gl.readPixels(0, 0, 1, 1, gl.RGBA, gl.UNSIGNED_BYTE, pixel);

  var finished = performance.now();
  return finished - began;
}

/**
 * Appends `msg` to the end of the page inside a new <pre> element.
 *
 * @param {string} msg - Text to display; inserted as a text node, not as HTML.
 * @returns {Promise<undefined>} An already-resolved promise, so the call can
 *     be used as a step in a promise chain.
 */
function log(msg) {
  var block = document.createElement("pre");
  var text = document.createTextNode(msg);
  block.appendChild(text);
  document.body.appendChild(block);
  return Promise.resolve();
}

/* Use a monospace font for the whole page. */
html, body { font-family: monospace; }

<script src="https://twgljs.org/dist/twgl.min.js"></script>

  <script id="vs1" type="notjs">
// Vertex shader for the branching (if/else) form of the "dir" computation.
attribute float vertexId;
void main() {
  // Stand-in inputs: both order.x and sinA are taken from the vertexId attribute.
  vec2 order = vec2(vertexId, 0);
  float sinA = vertexId;
  vec3 dir = vec3(0);
  vec3 sNormalPrev = vec3(1);
  vec3 sNormalNext = vec3(-1);
  // When |sinA| is below 0.2 pick one of the two normals, otherwise scale dir.
  if( abs(sinA) < 0.2 ) {
    // Exact float comparison: only order.x == 1.0 selects sNormalPrev.
    if(order.x == 1.0){
        dir = sNormalPrev;   
    } else {
        dir = sNormalNext;   
    }
  } else {
    // dir starts as vec3(0), so this product stays zero for finite operands.
    dir *= order.x / sinA;
  }
  // dir has to reach an output - presumably so the compiler cannot discard
  // the computation being measured.
  gl_Position = vec4(dir, 1.0); // have to use dir
  gl_PointSize = 1.0;
}
  </script>
  <script id="vs2" type="notjs">
// Vertex shader for the branchless (step-based) form of the "dir" computation.
attribute float vertexId;

void main() {
  // Stand-in inputs: both order.x and sinA are taken from the vertexId attribute.
  vec2 order = vec2(vertexId, 0);
  float sinA = vertexId;
  vec3 dir = vec3(0);
  vec3 sNormalPrev = vec3(1);
  vec3 sNormalNext = vec3(-1);
  
  // k is 0.0 when abs(sinA) < 0.2 and 1.0 otherwise.
  float k = step(0.2, abs(sinA));
  // k == 1.0 keeps the scaled-dir term; k == 0.0 keeps the step()-weighted sum of the normals.
  // NOTE(review): the first term is divided by sinA even when k == 0.0; a vertexId of 0.0
  // makes that a division by zero, so the result may be NaN/undefined - verify.
  dir = k * dir * order.x / sinA - (k-1.0) * (step(1.0, order.x + 1.0) * sNormalPrev + step(1.0,   -order.x + 1.0) * sNormalNext);
  
  // dir has to reach an output - presumably so the compiler cannot discard
  // the computation being measured.
  gl_Position = vec4(dir, 1.0); // have to use dir
  gl_PointSize = 1.0;
}
  </script>
  <script id="fs" type="notjs">
// Fragment shader used by both programs: writes a constant colour with
// every component set to 1.0.
precision mediump float;
void main() {
  gl_FragColor = vec4(1);
}
  </script>

也许是我的测试写得不好。我在 2015 年初款 MacBook Pro 和 iPhone 6s+ 上进行了测试。

Answer 2

GPU 核心大多是宽 SIMD 单元，它们通过掩码（masking）来处理 if 语句。根据 GPU 架构的不同，着色器编译器会把控制流语句转换为掩码操作，这与您手动改写代码的方式非常相似。

在 PC 上，GPU 驱动程序有足够的处理能力来充分优化着色器，因此您的优化不会带来任何区别。根据 2010 年的这篇博客文章，您的这种优化在移动平台上是有意义的。但我认为现代智能手机已不再如此，因为它们同样有足够的处理能力来充分优化着色器，而且驱动程序也已随着时间推移而趋于成熟。

您还可以尝试使用前面的博客文章中提到的工具GLSL optimizer。此外，一些GPU供应商还提供用于分析着色器的工具。

GLSL优化。什么更快？

2 个答案: