原生JavaScript Float32Array比使用SIMD的asm.js更快?

时间:2015-07-24 19:09:38

标签: matrix three.js simd emscripten asm.js

我想通过编写一个asm.js模块来加速three.js中的multiplyMatrices函数。但当我终于让它运行起来之后,它看起来反而比直接操作Float32Array的原生JavaScript实现更慢。

我的asm.js模块使用了 http://fhtr.blogspot.co.uk/2010/02/4x4-float-matrix-multiplication-using.html 中的代码:

#include <xmmintrin.h>

#include <iostream>

extern "C" {
/* Program entry point: performs no work and reports success. */
int main()
{
    return 0;
}
/**
 * Four packed single-precision floats held in one SSE register.
 *
 * Thin wrapper over __m128 that provides lane-wise arithmetic operators.
 * Loads from and stores to memory use the unaligned intrinsics
 * (_mm_loadu_ps / _mm_storeu_ps), so the float pointers handed to the
 * pointer constructor and to operator>> only need ordinary float alignment;
 * the aligned forms require 16-byte alignment, which callers that pack
 * matrices at arbitrary byte offsets cannot guarantee.
 */
struct vec4
{
    __m128 xmm;

    /** Wraps an existing SSE register value. */
    vec4(__m128 v) : xmm(v) {}

    /** Broadcasts v into all four lanes. */
    vec4(float v) : xmm(_mm_set1_ps(v)) {}

    /** Builds the vector with x in lane 0 through w in lane 3. */
    vec4(float x, float y, float z, float w)
        : xmm(_mm_set_ps(w, z, y, x))   // _mm_set_ps takes the highest lane first
    {
    }

    /** Loads four consecutive floats starting at v (no alignment requirement). */
    vec4(const float *v) : xmm(_mm_loadu_ps(v)) {}

    /* Lane-wise binary arithmetic; each returns a new vector. */

    vec4 operator* (const vec4 &v) const
    {
        return vec4(_mm_mul_ps(xmm, v.xmm));
    }

    vec4 operator+ (const vec4 &v) const
    {
        return vec4(_mm_add_ps(xmm, v.xmm));
    }

    vec4 operator- (const vec4 &v) const
    {
        return vec4(_mm_sub_ps(xmm, v.xmm));
    }

    vec4 operator/ (const vec4 &v) const
    {
        return vec4(_mm_div_ps(xmm, v.xmm));
    }

    /* Lane-wise compound assignment; each returns *this so calls can chain. */

    vec4 &operator*= (const vec4 &v)
    {
        xmm = _mm_mul_ps(xmm, v.xmm);
        return *this;
    }

    vec4 &operator+= (const vec4 &v)
    {
        xmm = _mm_add_ps(xmm, v.xmm);
        return *this;
    }

    vec4 &operator-= (const vec4 &v)
    {
        xmm = _mm_sub_ps(xmm, v.xmm);
        return *this;
    }

    vec4 &operator/= (const vec4 &v)
    {
        xmm = _mm_div_ps(xmm, v.xmm);
        return *this;
    }

    /** Stores the four lanes to v[0..3] (no alignment requirement). */
    void operator>> (float *v) const
    {
        _mm_storeu_ps(v, xmm);
    }

};

int mmul_vec4 (float *a, float *b, float *r)
{
    for (int i = 0; i < 16; i += 4) {
        vec4 rl = vec4(a) * vec4(b[i]);
        for (int j = 1; j < 4; j++) {
            rl += vec4(&a[j * 4]) * vec4(b[i + j]);
        }
        rl >> &r[i];
    }
    return 0;
}
}

下面是调用该asm.js模块的JavaScript代码(malloc被移到了init中,因为这个函数每秒会被调用约500次);为了减少代码量,大多数指针偏移都是手动计算的。即便如此,它仍然比使用Float32Array的原生JavaScript版本慢约2倍。

/**
 * One-time setup: reserves Emscripten heap memory for three 4x4 float
 * matrices and binds the compiled mmul_vec4 function.
 */
Maths.prototype.init = function () {
    // One slot per matrix: 16 floats x 4 bytes = 64 bytes of data plus
    // 8 extra bytes, i.e. 72 bytes. Three slots: 3 * 72 = 216 bytes.
    var SLOT_BYTES = 16 * 4 + 8;
    var SLOT_COUNT = 3;
    this.dataPtr = Module._malloc(SLOT_COUNT * SLOT_BYTES);

    // Bind the function exported by the Emscripten-generated file; it takes
    // three numeric arguments (heap pointers) and returns a number.
    var argTypes = ['number', 'number', 'number'];
    this.mmul_vec4 = Module.cwrap('mmul_vec4', 'number', argTypes);
};
Maths.prototype.matrix4multiply = function(matrix1, matrix2, target, callback) {
            /**
             * Multiplies two 4x4 matrices through the compiled mmul_vec4 routine.
             *
             * Copies 64 bytes from each input into the heap region held in
             * this.dataPtr, runs the multiplication, then copies the 16 resulting
             * floats into dataTarget.
             *
             * @param data       typed array holding the first matrix
             * @param data2      typed array holding the second matrix
             * @param dataTarget array-like with a set() method that receives the 16 result floats
             */
            App3d.Maths.prototype.matrix4multiplyBlocking = function(data, data2, dataTarget) {
    var MATRIX_BYTES = 64;   // 16 floats x 4 bytes
    var MATRIX_FLOATS = 16;

    // Slot layout inside the heap region: first operand, second operand, result.
    var firstPtr = this.dataPtr;
    var secondPtr = this.dataPtr + 72;
    var resultPtr = this.dataPtr + 144;

    // Copy the first matrix, byte for byte, into its slot on the Emscripten heap.
    var firstSlot = new Uint8Array(Module.HEAPU8.buffer, firstPtr, MATRIX_BYTES);
    firstSlot.set(new Uint8Array(data.buffer, data.byteOffset, MATRIX_BYTES));

    // Same for the second matrix.
    var secondSlot = new Uint8Array(Module.HEAPU8.buffer, secondPtr, MATRIX_BYTES);
    secondSlot.set(new Uint8Array(data2.buffer, data2.byteOffset, MATRIX_BYTES));

    // Multiply the two operands; the product lands in the result slot.
    this.mmul_vec4(firstPtr, secondPtr, resultPtr);

    // Read the heap buffer again after the call, then hand the product back.
    var product = new Float32Array(Module.HEAPU8.buffer, resultPtr, MATRIX_FLOATS);
    dataTarget.set(product);
};

原始的Three.js函数:

multiplyMatrices: function ( a, b ) {

    var ae = a.elements;
    var be = b.elements;
    var te = this.elements;

    var a11 = ae[ 0 ], a12 = ae[ 4 ], a13 = ae[ 8 ], a14 = ae[ 12 ];
    var a21 = ae[ 1 ], a22 = ae[ 5 ], a23 = ae[ 9 ], a24 = ae[ 13 ];
    var a31 = ae[ 2 ], a32 = ae[ 6 ], a33 = ae[ 10 ], a34 = ae[ 14 ];
    var a41 = ae[ 3 ], a42 = ae[ 7 ], a43 = ae[ 11 ], a44 = ae[ 15 ];

    var b11 = be[ 0 ], b12 = be[ 4 ], b13 = be[ 8 ], b14 = be[ 12 ];
    var b21 = be[ 1 ], b22 = be[ 5 ], b23 = be[ 9 ], b24 = be[ 13 ];
    var b31 = be[ 2 ], b32 = be[ 6 ], b33 = be[ 10 ], b34 = be[ 14 ];
    var b41 = be[ 3 ], b42 = be[ 7 ], b43 = be[ 11 ], b44 = be[ 15 ];

    te[ 0 ] = a11 * b11 + a12 * b21 + a13 * b31 + a14 * b41;
    te[ 4 ] = a11 * b12 + a12 * b22 + a13 * b32 + a14 * b42;
    te[ 8 ] = a11 * b13 + a12 * b23 + a13 * b33 + a14 * b43;
    te[ 12 ] = a11 * b14 + a12 * b24 + a13 * b34 + a14 * b44;

    te[ 1 ] = a21 * b11 + a22 * b21 + a23 * b31 + a24 * b41;
    te[ 5 ] = a21 * b12 + a22 * b22 + a23 * b32 + a24 * b42;
    te[ 9 ] = a21 * b13 + a22 * b23 + a23 * b33 + a24 * b43;
    te[ 13 ] = a21 * b14 + a22 * b24 + a23 * b34 + a24 * b44;

    te[ 2 ] = a31 * b11 + a32 * b21 + a33 * b31 + a34 * b41;
    te[ 6 ] = a31 * b12 + a32 * b22 + a33 * b32 + a34 * b42;
    te[ 10 ] = a31 * b13 + a32 * b23 + a33 * b33 + a34 * b43;
    te[ 14 ] = a31 * b14 + a32 * b24 + a33 * b34 + a34 * b44;

    te[ 3 ] = a41 * b11 + a42 * b21 + a43 * b31 + a44 * b41;
    te[ 7 ] = a41 * b12 + a42 * b22 + a43 * b32 + a44 * b42;
    te[ 11 ] = a41 * b13 + a42 * b23 + a43 * b33 + a44 * b43;
    te[ 15 ] = a41 * b14 + a42 * b24 + a43 * b34 + a44 * b44;

    return this;

},

0 个答案:

没有答案