Hullo,我对装配知之甚少 我正在考虑如何通过重写它来优化它 在x86(32位fpu或sse2)程序集中,应该是 优化 - 在正确的装配中重写,然后我将测试是否 我有一些加速(长度()和点()应该是 也写在asm这里。这个代码由我使用 简单的实时光线跟踪器,它的工作原理 - 但我并不多 擅长asm优化)
inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
{
static float3 Q;
Q = sub(sO,rO);
float c = length(&Q);
float v = dot(&Q,rV);
float d = sR*sR - (c*c - v*v);
// If there was no intersection, return -1
if (d < 0.0) return (-1.0f);
// Return the distance to the [first] intersecting point
return (v - sqrt(d));
}
提前感谢
//编辑
struct float3
{
float x;
float y;
float z;
};
inline float length(float3* v) {
return sqrt( (v->x)*(v->x) + (v->y)*(v->y) + (v->z)*(v->z) );
}
inline float dot(float3* a, float3* b) {
return (*a).x * (*b).x + (*a).y * (*b).y + (*a).z * (*b).z;
}
和demo exe(unoptymized甚至没有那么多optymized c):
dl.dropbox.com/u/42887985/re29.zip
也许有人可以给我一些不错的fpu asm例程 点(或标准化未在此处显示)??虽然整个过程相交 程序将是最好的; - )
答案 0 :(得分:2)
__asm
{
movaps xmm0,[float3] //this is vector of yours into xmm0
mulps xmm0,xmm0 //this is each term squared
pxor xmm1,xmm1 //clean xmm1 first
movlhps xmm1,xmm0 //lower 2 terms to the higher 2 parts of xmm1
addps xmm0,xmm1 //higher 2 terms of xmm0 now has x_square+z_square and y_square + zero_square
shufps xmm2,xmm0,0 //we copy y_square to all 4 elements of xmm2
addps xmm0,xmm2 //now we have sum of all squares in highest of xmm0
shufps xmm0,xmm0,11111111b // copy result to all 4 parts
sqrtss xmm0,xmm0 //scalar square-root
movaps [result],xmm0
}
这可能比完全优化慢,但应足够快以进行矢量长度计算。需要矢量对齐 - 16字节。如果您不想对齐,请将movaps更改为movup。如果您可以使用此代码,那么您可以通过添加
来提高性能align 16
在movaps xmm0的开头,[float3]使代码也对齐。然后你可以检查每个实例有多少字节。尝试达到最佳代码长度(16个字节的倍数)。在sse2(sse3,sse4,avx)之后,存在垂直水平向量指令,其仅产生1条指令以获得结果。
在第二条指令中编辑mm0,xmm0到xmm0,xmm0
这里有一些清单:
答案 1 :(得分:2)
这不是转换为SSE的“好”功能。几乎没有什么是实际上平行的。所以让我们改变函数一次交叉4条光线。如果光线存储在SOA (struct of arrays)而不是AOS(结构数组)中,这将有所帮助。
通过这些更改,它可能会变成这样(不以任何方式测试):
inline void intersect4RaysSphere(
float* rOx, float* rOy, float* rOz,
float* rVx, float* rVy, float* rVz,
float sOx, float sOy, float sOz,
float sR)
{
// calculate Q
movss xmm0, sOx
movss xmm1, sOy
movss xmm2, sOz
shufps xmm0, xmm0, 0
shufps xmm1, xmm1, 0
shufps xmm2, xmm2, 0
subps xmm0, [rOx]
subps xmm1, [rOy]
subps xmm2, [rOz]
// calculate pow(dot(Q, rV), 2) in xmm3
movaps xmm3, [rVx]
movaps xmm4, [rVy]
movaps xmm5, [rVz]
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
mulps xmm3, xmm3
// calculate pow(length(Q), 2)
// there's no point in taking the square root only to then square it
mulps xmm0, xmm0
mulps xmm1, xmm1
mulps xmm2, xmm2
addps xmm0, xmm1
addps xmm0, xmm2
// calculate d
movss xmm1, sR
mulss xmm1, xmm1
shufps xmm1, xmm1, 0
subps xmm0, xmm3
subps xmm1, xmm0
sqrtps xmm1, xmm1
// test for intersection
// at this point:
// xmm3 = v * v
// xmm4 = v
// xmm1 = sqrt(d)
movaps xmm0, [minus1] // memory location with { -1.0, -1.0, -1.0, -1.0 }
subps xmm4, xmm1
// get a mask of d's smaller than 0.0
psrad xmm1, 31
// select -1 if less than zero or v*v - d if >= 0
andps xmm0, xmm1
andnps xmm1, xmm4
orps xmm0, xmm1
ret
}
带内在函数的版本(仅经过轻微测试 - 可编译,似乎生成OK程序集):
__m128 intersect4RaysSphere(
float* rOx, float* rOy, float* rOz,
float* rVx, float* rVy, float* rVz,
float sOx, float sOy, float sOz,
float sR)
{
__m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
__m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
__m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
__m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
_mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
_mm_mul_ps(Qz, _mm_load_ps(rVz))));
__m128 vsquared = _mm_mul_ps(v, v);
__m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
_mm_add_ps(_mm_mul_ps(Qy, Qy),
_mm_mul_ps(Qz, Qz)));
__m128 sr = _mm_set1_ps(sR);
__m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));
__m128 mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(d), 31));
//__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
_mm_andnot_ps(mask, _mm_sub_ps(vsquared, d)));
__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
_mm_andnot_ps(mask, _mm_sub_ps(v, _mm_sqrt_ps(d))));
return result;
}