我想通过创建 asm.js 模块来加速 three.js 中的 multiplyMatrices 函数。当我终于让它运行起来之后,却发现它比直接操作 Float32Array 的原生 JavaScript 实现还要慢。
我的 asm.js 模块使用了 http://fhtr.blogspot.co.uk/2010/02/4x4-float-matrix-multiplication-using.html 中的代码:
#include <xmmintrin.h>
#include <iostream>
extern "C" {
// Empty program entry point: it performs no work and falls off the end,
// which for main is an implicit `return 0`.
// NOTE(review): presumably present only so the module builds as a runnable
// program; the visible code never calls it -- confirm before removing.
int main()
{
}
/**
 * Thin value wrapper around one SSE register (__m128) holding four packed
 * single-precision floats, with lane-wise arithmetic operators.
 *
 * Memory access (the pointer constructor and operator>>) uses the unaligned
 * load/store intrinsics, so any valid float pointer is accepted. The aligned
 * variants fault on x86 when the address is not a multiple of 16 bytes.
 */
struct vec4
{
    __m128 xmm;

    /** Wraps an existing register value. */
    vec4(__m128 v) : xmm(v) {}

    /** Broadcasts v into all four lanes. */
    vec4(float v) : xmm(_mm_set1_ps(v)) {}

    /**
     * Builds a vector with x in lane 0 through w in lane 3.
     * _mm_set_ps takes its arguments highest lane first, hence the reversal.
     */
    vec4(float x, float y, float z, float w) : xmm(_mm_set_ps(w, z, y, x)) {}

    /** Loads v[0..3]; v does not need to be 16-byte aligned. */
    vec4(const float *v) : xmm(_mm_loadu_ps(v)) {}

    /** Lane-wise product. */
    vec4 operator* (const vec4 &v) const
    {
        return vec4(_mm_mul_ps(xmm, v.xmm));
    }

    /** Lane-wise sum. */
    vec4 operator+ (const vec4 &v) const
    {
        return vec4(_mm_add_ps(xmm, v.xmm));
    }

    /** Lane-wise difference. */
    vec4 operator- (const vec4 &v) const
    {
        return vec4(_mm_sub_ps(xmm, v.xmm));
    }

    /** Lane-wise quotient. */
    vec4 operator/ (const vec4 &v) const
    {
        return vec4(_mm_div_ps(xmm, v.xmm));
    }

    /* Compound assignments update this vector in place and return it, so
     * they can be chained; using them as plain statements still works. */
    vec4 &operator*= (const vec4 &v)
    {
        xmm = _mm_mul_ps(xmm, v.xmm);
        return *this;
    }

    vec4 &operator+= (const vec4 &v)
    {
        xmm = _mm_add_ps(xmm, v.xmm);
        return *this;
    }

    vec4 &operator-= (const vec4 &v)
    {
        xmm = _mm_sub_ps(xmm, v.xmm);
        return *this;
    }

    vec4 &operator/= (const vec4 &v)
    {
        xmm = _mm_div_ps(xmm, v.xmm);
        return *this;
    }

    /** Stores the four lanes to v[0..3]; v does not need to be 16-byte aligned. */
    void operator>> (float *v) const
    {
        _mm_storeu_ps(v, xmm);
    }
};
int mmul_vec4 (float *a, float *b, float *r)
{
for (int i = 0; i < 16; i += 4) {
vec4 rl = vec4(a) * vec4(b[i]);
for (int j = 1; j < 4; j++) {
rl += vec4(&a[j * 4]) * vec4(b[i + j]);
}
rl >> &r[i];
}
return 0;
}
}
下面是调用此 asm.js 模块的 JavaScript 代码(malloc 被移到了 init 中,因为该函数每秒会被调用约 500 次);为了减少代码量,大多数指针偏移都是手动计算的。即便如此,它仍然比使用 Float32Array 的原生 JavaScript 版本慢约 2 倍。
/**
 * One-time setup for the Emscripten-backed matrix multiply.
 *
 * Reserves a single 216-byte scratch region on the Emscripten heap and stores
 * its address in `this.dataPtr`, then wraps the exported `mmul_vec4` function
 * and stores the wrapper in `this.mmul_vec4`.
 */
Maths.prototype.init = function () {
    // Three slots of 72 bytes each: 16 floats x 4 bytes of matrix data
    // plus 8 spare bytes per slot -> 216 bytes in total.
    var SLOT_BYTES = 16 * 4 + 8;
    var SLOT_COUNT = 3;
    this.dataPtr = Module._malloc(SLOT_BYTES * SLOT_COUNT);

    // mmul_vec4 takes three pointers (passed as numbers) and returns a number.
    var argTypes = ['number', 'number', 'number'];
    this.mmul_vec4 = Module.cwrap('mmul_vec4', 'number', argTypes);
};
Maths.prototype.matrix4multiply = function(matrix1, matrix2, target, callback) {
/**
 * Multiplies two matrices through the wrapped `mmul_vec4` function.
 *
 * Copies the first 64 bytes of `data` and `data2` into the first and second
 * 72-byte slots of the scratch region at `this.dataPtr`, calls `mmul_vec4`
 * with the third slot as destination, and copies the 16 resulting floats
 * into `dataTarget`.
 *
 * @param data        typed array supplying the first operand (64 bytes are read)
 * @param data2       typed array supplying the second operand (64 bytes are read)
 * @param dataTarget  object whose `set` method receives the 16-float result
 */
App3d.Maths.prototype.matrix4multiplyBlocking = function(data, data2, dataTarget) {
    var MATRIX_BYTES = 64;
    var SLOT_BYTES = 72;

    var firstPtr = this.dataPtr;
    var secondPtr = firstPtr + SLOT_BYTES;
    var resultPtr = firstPtr + 2 * SLOT_BYTES;

    // Stage the first operand on the Emscripten heap.
    var firstSlot = new Uint8Array(Module.HEAPU8.buffer, firstPtr, MATRIX_BYTES);
    firstSlot.set(new Uint8Array(data.buffer, data.byteOffset, MATRIX_BYTES));

    // Stage the second operand in the next slot.
    var secondSlot = new Uint8Array(Module.HEAPU8.buffer, secondPtr, MATRIX_BYTES);
    secondSlot.set(new Uint8Array(data2.buffer, data2.byteOffset, MATRIX_BYTES));

    // Multiply the two staged operands; the result lands in the third slot.
    this.mmul_vec4(firstPtr, secondPtr, resultPtr);

    // The heap buffer is looked up again after the call before reading back.
    var product = new Float32Array(Module.HEAPU8.buffer, resultPtr, 16);
    dataTarget.set(product);
};
原始的 Three.js 函数:
multiplyMatrices: function ( a, b ) {
var ae = a.elements;
var be = b.elements;
var te = this.elements;
var a11 = ae[ 0 ], a12 = ae[ 4 ], a13 = ae[ 8 ], a14 = ae[ 12 ];
var a21 = ae[ 1 ], a22 = ae[ 5 ], a23 = ae[ 9 ], a24 = ae[ 13 ];
var a31 = ae[ 2 ], a32 = ae[ 6 ], a33 = ae[ 10 ], a34 = ae[ 14 ];
var a41 = ae[ 3 ], a42 = ae[ 7 ], a43 = ae[ 11 ], a44 = ae[ 15 ];
var b11 = be[ 0 ], b12 = be[ 4 ], b13 = be[ 8 ], b14 = be[ 12 ];
var b21 = be[ 1 ], b22 = be[ 5 ], b23 = be[ 9 ], b24 = be[ 13 ];
var b31 = be[ 2 ], b32 = be[ 6 ], b33 = be[ 10 ], b34 = be[ 14 ];
var b41 = be[ 3 ], b42 = be[ 7 ], b43 = be[ 11 ], b44 = be[ 15 ];
te[ 0 ] = a11 * b11 + a12 * b21 + a13 * b31 + a14 * b41;
te[ 4 ] = a11 * b12 + a12 * b22 + a13 * b32 + a14 * b42;
te[ 8 ] = a11 * b13 + a12 * b23 + a13 * b33 + a14 * b43;
te[ 12 ] = a11 * b14 + a12 * b24 + a13 * b34 + a14 * b44;
te[ 1 ] = a21 * b11 + a22 * b21 + a23 * b31 + a24 * b41;
te[ 5 ] = a21 * b12 + a22 * b22 + a23 * b32 + a24 * b42;
te[ 9 ] = a21 * b13 + a22 * b23 + a23 * b33 + a24 * b43;
te[ 13 ] = a21 * b14 + a22 * b24 + a23 * b34 + a24 * b44;
te[ 2 ] = a31 * b11 + a32 * b21 + a33 * b31 + a34 * b41;
te[ 6 ] = a31 * b12 + a32 * b22 + a33 * b32 + a34 * b42;
te[ 10 ] = a31 * b13 + a32 * b23 + a33 * b33 + a34 * b43;
te[ 14 ] = a31 * b14 + a32 * b24 + a33 * b34 + a34 * b44;
te[ 3 ] = a41 * b11 + a42 * b21 + a43 * b31 + a44 * b41;
te[ 7 ] = a41 * b12 + a42 * b22 + a43 * b32 + a44 * b42;
te[ 11 ] = a41 * b13 + a42 * b23 + a43 * b33 + a44 * b43;
te[ 15 ] = a41 * b14 + a42 * b24 + a43 * b34 + a44 * b44;
return this;
},