Question

Hullo，我对装配知之甚少我正在考虑如何通过重写它来优化它在x86（32位fpu或sse2）程序集中，应该是优化 - 在正确的装配中重写，然后我将测试是否我有一些加速（长度（）和点（）应该是也写在asm这里。这个代码由我使用简单的实时光线跟踪器，它的工作原理 - 但我并不多擅长asm优化）

    inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
   {
    static float3 Q;

    Q = sub(sO,rO);
    float c = length(&Q);
    float v = dot(&Q,rV);
    float d = sR*sR - (c*c - v*v);

    // If there was no intersection, return -1
    if (d < 0.0) return (-1.0f);

    // Return the distance to the [first] intersecting point
    return (v - sqrt(d));
    }

提前感谢

//编辑

    struct float3
    {
     float x;
     float y;
     float z;
    };


    inline float length(float3* v) {
     return sqrt( (v->x)*(v->x) + (v->y)*(v->y) + (v->z)*(v->z) );
    }

   inline float dot(float3* a, float3* b) {
     return (*a).x * (*b).x + (*a).y * (*b).y + (*a).z * (*b).z;
   }

和demo exe（unoptymized甚至没有那么多optymized c）：

dl.dropbox.com/u/42887985/re29.zip

也许有人可以给我一些不错的fpu asm例程点（或标准化未在此处显示）??虽然整个过程相交程序将是最好的; - ）

Answer 1

__asm
    {
    movaps xmm0,[float3] //this is vector of yours into xmm0
    mulps xmm0,xmm0       //this is each term squared
    pxor xmm1,xmm1       //clean xmm1 first
    movlhps xmm1,xmm0    //lower 2 terms to the higher 2 parts of xmm1
    addps xmm0,xmm1      //higher 2 terms of xmm0 now has x_square+z_square and  y_square + zero_square
    shufps xmm2,xmm0,0 //we copy y_square to all 4 elements of xmm2
    addps xmm0,xmm2     //now we have sum of all squares in highest of xmm0
    shufps xmm0,xmm0,11111111b // copy result to all 4 parts
    sqrtss xmm0,xmm0           //scalar square-root
    movaps [result],xmm0
    }

这可能比完全优化慢，但应足够快以进行矢量长度计算。需要矢量对齐 - 16字节。如果您不想对齐，请将movaps更改为movup。如果您可以使用此代码，那么您可以通过添加

来提高性能

align 16

在movaps xmm0的开头，[float3]使代码也对齐。然后你可以检查每个实例有多少字节。尝试达到最佳代码长度（16个字节的倍数）。在sse2（sse3，sse4，avx）之后，存在垂直水平向量指令，其仅产生1条指令以获得结果。

在第二条指令中编辑mm0，xmm0到xmm0，xmm0

这里有一些清单：

http://softpixel.com/~cwright/programming/simd/sse2.php

Answer 2

这不是转换为SSE的“好”功能。几乎没有什么是实际上平行的。所以让我们改变函数一次交叉4条光线。如果光线存储在SOA (struct of arrays)而不是AOS（结构数组）中，这将有所帮助。

通过这些更改，它可能会变成这样（不以任何方式测试）：

inline void intersect4RaysSphere(
 float* rOx, float* rOy, float* rOz,
 float* rVx, float* rVy, float* rVz,
 float sOx, float sOy, float sOz,
 float sR)
{
    // calculate Q
    movss xmm0, sOx
    movss xmm1, sOy
    movss xmm2, sOz
    shufps xmm0, xmm0, 0
    shufps xmm1, xmm1, 0
    shufps xmm2, xmm2, 0
    subps xmm0, [rOx]
    subps xmm1, [rOy]
    subps xmm2, [rOz]
    // calculate pow(dot(Q, rV), 2) in xmm3
    movaps xmm3, [rVx]
    movaps xmm4, [rVy]
    movaps xmm5, [rVz]
    mulps xmm3, xmm0
    mulps xmm4, xmm1
    mulps xmm5, xmm2
    addps xmm3, xmm4
    addps xmm3, xmm5
    movaps xmm4, xmm3
    mulps xmm3, xmm3
    // calculate pow(length(Q), 2)
    // there's no point in taking the square root only to then square it
    mulps xmm0, xmm0
    mulps xmm1, xmm1
    mulps xmm2, xmm2
    addps xmm0, xmm1
    addps xmm0, xmm2
    // calculate d
    movss xmm1, sR
    mulss xmm1, xmm1
    shufps xmm1, xmm1, 0
    subps xmm0, xmm3
    subps xmm1, xmm0
    sqrtps xmm1, xmm1
    // test for intersection
    // at this point:
    // xmm3 = v * v
    // xmm4 = v
    // xmm1 = sqrt(d)
    movaps xmm0, [minus1]  // memory location with { -1.0, -1.0, -1.0, -1.0 }
    subps xmm4, xmm1
    // get a mask of d's smaller than 0.0
    psrad xmm1, 31
    // select -1 if less than zero or v*v - d if >= 0
    andps xmm0, xmm1
    andnps xmm1, xmm4
    orps xmm0, xmm1
    ret
}

带内在函数的版本（仅经过轻微测试 - 可编译，似乎生成OK程序集）：

__m128 intersect4RaysSphere(
     float* rOx, float* rOy, float* rOz,
     float* rVx, float* rVy, float* rVz,
     float sOx, float sOy, float sOz,
     float sR)
{
    __m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
    __m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
    __m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
    __m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
               _mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
                          _mm_mul_ps(Qz, _mm_load_ps(rVz))));
    __m128 vsquared = _mm_mul_ps(v, v);
    __m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
                            _mm_add_ps(_mm_mul_ps(Qy, Qy),
                                       _mm_mul_ps(Qz, Qz)));
    __m128 sr = _mm_set1_ps(sR);
    __m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));
    __m128 mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(d), 31));
    //__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
                              _mm_andnot_ps(mask, _mm_sub_ps(vsquared, d)));
    __m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
                              _mm_andnot_ps(mask, _mm_sub_ps(v, _mm_sqrt_ps(d))));
    return result;
}

一些intersectRaySphere c程序optymization通过将其重写为x86 asm（如何？）

2 个答案: