Circumcircle parameters using AVX intrinsics

Date: 2016-11-15 10:13:56

Tags: x86 clang c++14 avx voronoi

I tried to implement Fortune's algorithm for generating a Voronoi diagram in the plane, using a native STL tree with a transparent comparator. Almost all the bugs found so far have been fixed, and the implementation's asymptotic complexity is O(N * log(N)). Now I want to try some optimizations. From the profiling results (I used pprof from Google Performance Tools) I concluded that there are two very hot functions, and both of them express very idiomatic mathematical algorithms: the first evaluates the ordinate of the intersection point of two parabolic arcs given a common directrix, and the second evaluates the center and radius of the circle circumscribing 3 given points. The second one makes sense: the circumscribed circle of 3 points (sites of the Voronoi diagram) defines a vertex of the Voronoi diagram, hence the name "circle event" in terms of Fortune's algorithm. The vertex-finding function (based on these formulae) is the following:
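
For reference, the scalar branch below evaluates the standard Cartesian expressions for the circumcenter (u_x, u_y) of the triangle (a, b, c), which is presumably what the "these formulae" link refers to; in the code, alpha is D / 2:

    D   = 2 [ a_x (b_y - c_y) + b_x (c_y - a_y) + c_x (a_y - b_y) ]
    u_x = [ (a_x^2 + a_y^2)(b_y - c_y) + (b_x^2 + b_y^2)(c_y - a_y) + (c_x^2 + c_y^2)(a_y - b_y) ] / D
    u_y = [ (a_x^2 + a_y^2)(c_x - b_x) + (b_x^2 + b_y^2)(a_x - c_x) + (c_x^2 + c_y^2)(b_x - a_x) ] / D

The circumradius R is then the distance from (u_x, u_y) to any of the three points.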

#include <experimental/optional> // std::experimental::optional
#include <immintrin.h>           // AVX intrinsics (__m128d, __m256d, _mm256_*)
#include <cmath>                 // std::sqrt

using value_type = double;

struct alignas(__m128d) point
{
    value_type x, y;
};

// vertex and eps are defined elsewhere in my code: a vertex is constructed
// from a point (the circumcenter) and a radius, and eps is a small positive tolerance
std::experimental::optional< vertex >
make_vertex(point const & a,
            point const & b,
            point const & c)
{
#if 1
    value_type const A = a.x * a.x + a.y * a.y;
    value_type const B = b.x * b.x + b.y * b.y;
    value_type const C = c.x * c.x + c.y * c.y;
    point const ca = {a.x - c.x, a.y - c.y};
    point const cb = {b.x - c.x, b.y - c.y};
    value_type const CA = A - C;
    value_type const CB = B - C;
    value_type x = CA * cb.y - CB * ca.y;
    value_type y = ca.x * CB - cb.x * CA;
    value_type alpha = ca.x * cb.y - ca.y * cb.x; // cross product (a - c) x (b - c)
    if (!(eps < -alpha)) { // reject unless the triple (a, b, c) is strictly clockwise (alpha < -eps)
        return {};
    }
    value_type beta = a.x * (b.y * C - c.y * B) - b.x * (a.y * C - c.y * A) + c.x * (a.y * B - b.y * A);
    beta /= alpha;
    alpha += alpha;
    x /= alpha;
    y /= alpha;
    using std::sqrt;
    value_type const R = sqrt(beta + x * x + y * y);
    return {{{x, y}, R}};
#else
    __m256d a_ = _mm256_broadcast_pd((__m128d *)&a); // {a.x, a.y, a.x, a.y} in both 128-bit lanes
    __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
    __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
    __m256d A = _mm256_mul_pd(a_, a_);
    A = _mm256_hadd_pd(A, A); // |a|^2 in all four elements
    __m256d B = _mm256_mul_pd(b_, b_);
    B = _mm256_hadd_pd(B, B); // |b|^2 in all four elements
    __m256d C = _mm256_mul_pd(c_, c_);
    C = _mm256_hadd_pd(C, C); // |c|^2 in all four elements
    __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001); // {cb.y, ca.y, ca.x, cb.x}
    __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110); // {CA, CB, CB, CA} with CA = A - C, CB = B - C
    __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
    xxyy = _mm256_hsub_pd(xxyy, xxyy); // {x, x, y, y}, not yet divided by 2 * alpha
    __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0); // {x, y, y, x}
    __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
    alpha = _mm256_hsub_pd(alpha, alpha); // alpha in all four elements
    if (!(alpha[0] < -eps)) {
        return {};
    }
    __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
    __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
    __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
    bacc = _mm256_div_pd(bacc, alpha);
    xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha)); // divide by 2 * alpha
    __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
    beta = _mm256_hadd_pd(beta, beta);
    beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
    return {{{xyxy[0], xyxy[1]}, beta[0]}};
#endif
}

As you can see, I tried to reimplement this function using intrinsics for Intel AVX instructions. Both the #if 1 and #if 0 parts are fully equivalent and correct. The #if 0 version uses full-width ymm registers most of the time; only towards the very end do the upper halves of some variables contain garbage.
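
For completeness, here is a minimal sketch of the kind of harness such a test could use, checking the two branches against each other on random points and timing them together. It makes hypothetical assumptions: that the two #if branches are split out as make_vertex_scalar and make_vertex_avx, that vertex is {point c; value_type R;}, and that eps is a small positive constant visible at this scope.

#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::mt19937_64 gen{42};
    std::uniform_real_distribution< value_type > dist{-1.0, 1.0};
    std::vector< point > points(100000);
    for (point & p : points) {
        p = {dist(gen), dist(gen)};
    }
    auto const start = std::chrono::steady_clock::now();
    std::size_t vertices = 0;
    for (std::size_t i = 2; i < points.size(); ++i) {
        // hypothetical split of the two #if branches into separate functions
        auto const s = make_vertex_scalar(points[i - 2], points[i - 1], points[i]);
        auto const v = make_vertex_avx(points[i - 2], points[i - 1], points[i]);
        assert(!s == !v); // both branches must agree on whether a vertex exists
        if (s && v) {
            assert(std::abs(s->c.x - v->c.x) < eps); // assumes vertex is {point c; value_type R;}
            assert(std::abs(s->c.y - v->c.y) < eps);
            assert(std::abs(s->R - v->R) < eps);
            ++vertices;
        }
    }
    auto const stop = std::chrono::steady_clock::now();
    std::printf("%zu vertices in %.3f s\n", vertices,
                std::chrono::duration< double >(stop - start).count());
}

Consecutive triples are just a cheap way to exercise the function many times in isolation; in the real workload it is called for the circle events inside the sweep.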

I made several tests on a huge input (100000 points distributed uniformly over a bounded part of the plane): one for #if 1 and another for #if 0. The first version gives a better running time (0.28 s versus 0.31 s on average over many runs). Here is the disassembly of the #if 1 version:

push   %rbx
sub    $0x10,%rsp
mov    %rdi,%rbx
        319 [1]            value_type const A = a.x * a.x + a.y * a.y;
vmovsd (%rdx),%xmm11
vmovsd 0x8(%rdx),%xmm0
        320 [1]            value_type const B = b.x * b.x + b.y * b.y;
vmovsd (%rcx),%xmm9
vmovsd 0x8(%rcx),%xmm2
        321 [1]            value_type const C = c.x * c.x + c.y * c.y;
vmovsd (%r8),%xmm8
vmovsd 0x8(%r8),%xmm6
        322 [1]            point const ca = {a.x - c.x, a.y - c.y};
vunpcklpd %xmm9,%xmm0,%xmm1
vunpcklpd %xmm8,%xmm6,%xmm5
vsubpd %xmm5,%xmm1,%xmm7
        323 [1]            point const cb = {b.x - c.x, b.y - c.y};
vunpcklpd %xmm11,%xmm2,%xmm1
vsubpd %xmm5,%xmm1,%xmm1
        328 [1]            value_type alpha = ca.x * cb.y - ca.y * cb.x;
vpermilpd $0x1,%xmm1,%xmm5
vmulsd %xmm1,%xmm5,%xmm5
vpermilpd $0x1,%xmm7,%xmm3
vmulsd %xmm7,%xmm3,%xmm3
vsubsd %xmm3,%xmm5,%xmm12
        329 [1]            if (!(eps < -alpha)) {
mov    (%rsi),%rax
vxorpd 0x1cd1(%rip),%xmm12,%xmm3        # 0x40e680
vucomisd (%rax),%xmm3
jbe    0x40ca85 <make_vertex()+309>
        319 [2]            value_type const A = a.x * a.x + a.y * a.y;
vunpcklpd %xmm9,%xmm11,%xmm3
vmulpd %xmm3,%xmm3,%xmm10
vunpcklpd %xmm2,%xmm0,%xmm3
vmulpd %xmm3,%xmm3,%xmm3
vaddpd %xmm3,%xmm10,%xmm3
        321 [2]            value_type const C = c.x * c.x + c.y * c.y;
vmulsd %xmm8,%xmm8,%xmm10
vmulsd %xmm6,%xmm6,%xmm4
vaddsd %xmm4,%xmm10,%xmm4
        324 [1]            value_type const CA = A - C;
vmovddup %xmm4,%xmm5
vsubpd %xmm5,%xmm3,%xmm5
        326 [1]            value_type x = CA * cb.y - CB * ca.y;
vmulpd %xmm5,%xmm1,%xmm1
vpermilpd $0x1,%xmm5,%xmm5
vmulpd %xmm5,%xmm7,%xmm5
vsubpd %xmm5,%xmm1,%xmm10
        332 [1]            value_type beta = a.x * (b.y * C - c.y * B) + b.x * (c.y * A - a.y * C) + c.x * (a.y * B - b.y * A);
vmulsd %xmm4,%xmm2,%xmm5
vpermilpd $0x1,%xmm3,%xmm7
vmulsd %xmm7,%xmm6,%xmm1
vsubsd %xmm1,%xmm5,%xmm1
vmulsd %xmm1,%xmm11,%xmm1
vmulsd %xmm6,%xmm3,%xmm5
vmulsd %xmm4,%xmm0,%xmm4
vsubsd %xmm4,%xmm5,%xmm4
vmulsd %xmm4,%xmm9,%xmm4
vaddsd %xmm4,%xmm1,%xmm1
vmulsd %xmm7,%xmm0,%xmm0
vmulsd %xmm3,%xmm2,%xmm2
vsubsd %xmm2,%xmm0,%xmm0
vmulsd %xmm0,%xmm8,%xmm0
vaddsd %xmm1,%xmm0,%xmm0
        333 [1]            beta /= alpha;
vdivsd %xmm12,%xmm0,%xmm0
        334 [1]            alpha += alpha;
vaddsd %xmm12,%xmm12,%xmm1
        335 [1]            x /= alpha;
vmovddup %xmm1,%xmm1
vdivpd %xmm1,%xmm10,%xmm2
        338 [1]            value_type const R = sqrt(beta + x * x + y * y);
vmulsd %xmm2,%xmm2,%xmm1
vaddsd %xmm0,%xmm1,%xmm0
vpermilpd $0x1,%xmm2,%xmm1
vmulsd %xmm1,%xmm1,%xmm1
vaddsd %xmm0,%xmm1,%xmm1
vsqrtsd %xmm1,%xmm1,%xmm0
        370 [1]        }
mov    %rbx,%rax
add    $0x10,%rsp
pop    %rbx
retq

And here is the disassembly of the #if 0 version:

        317 [1]        {
vmovupd (%rdx),%xmm2
vmovupd (%rcx),%xmm1
vmovupd (%r8),%xmm4
        350 [1]            __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
vsubpd %xmm4,%xmm2,%xmm0
vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
vsubpd %xmm4,%xmm1,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
vpermilpd $0x1,%ymm3,%ymm3
vblendpd $0x6,%ymm0,%ymm3,%ymm5
        355 [1]            __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
vperm2f128 $0x1,%ymm0,%ymm5,%ymm0
vmulpd %ymm0,%ymm5,%ymm0
        356 [1]            alpha = _mm256_hsub_pd(alpha, alpha);
vhsubpd %ymm0,%ymm0,%ymm0
        357 [1]            if (!(alpha[0] < -eps)) {
mov    (%rsi),%rax
vmovsd (%rax),%xmm3
vxorpd 0x1cc6(%rip),%xmm3,%xmm3        # 0x40e660
vucomisd %xmm0,%xmm3
jbe    0x40ca6d <make_vertex()+285>
        343 [1]            __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
vinsertf128 $0x1,%xmm4,%ymm4,%ymm6
        344 [1]            __m256d A = _mm256_mul_pd(a_, a_);
vmulpd %xmm2,%xmm2,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
        345 [1]            A = _mm256_hadd_pd(A, A);
vhaddpd %ymm3,%ymm3,%ymm3
        346 [1]            __m256d B = _mm256_mul_pd(b_, b_);
vmulpd %xmm1,%xmm1,%xmm7
vinsertf128 $0x1,%xmm7,%ymm7,%ymm7
        347 [1]            B = _mm256_hadd_pd(B, B);
vhaddpd %ymm7,%ymm7,%ymm7
        348 [1]            __m256d C = _mm256_mul_pd(c_, c_);
vmulpd %xmm4,%xmm4,%xmm4
vinsertf128 $0x1,%xmm4,%ymm4,%ymm4
        349 [1]            C = _mm256_hadd_pd(C, C);
vhaddpd %ymm4,%ymm4,%ymm4
        351 [1]            __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
vunpcklpd %ymm7,%ymm3,%ymm8
vsubpd %ymm4,%ymm8,%ymm8
vpermilpd $0x6,%ymm8,%ymm8
        352 [1]            __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
vmulpd %ymm8,%ymm5,%ymm5
        353 [1]            xxyy = _mm256_hsub_pd(xxyy, xxyy);
vhsubpd %ymm5,%ymm5,%ymm5
        342 [1]            __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
        341 [1]            __m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
        354 [1]            __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
vperm2f128 $0x23,%ymm5,%ymm0,%ymm9
vunpcklpd %ymm9,%ymm5,%ymm5
        360 [1]            __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
vpermilpd $0x9,%ymm2,%ymm2
vpermilpd $0x1,%xmm1,%xmm1
vinsertf128 $0x1,%xmm1,%ymm6,%ymm1
vmulpd %ymm1,%ymm2,%ymm1
        361 [1]            __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
vpermilpd $0x1,%ymm6,%ymm2
vmulpd %ymm2,%ymm8,%ymm2
        362 [1]            __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
vinsertf128 $0x1,%xmm4,%ymm7,%ymm4
vmulpd %ymm4,%ymm1,%ymm1
vmulpd %ymm2,%ymm3,%ymm2
vhsubpd %ymm2,%ymm1,%ymm1
vpermilpd $0x2,%ymm1,%ymm1
        363 [1]            bacc = _mm256_div_pd(bacc, alpha);
vdivpd %ymm0,%ymm1,%ymm1
        364 [1]            xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
vaddpd %ymm0,%ymm0,%ymm0
vdivpd %ymm0,%ymm5,%ymm0
        365 [1]            __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
vmulpd %ymm0,%ymm0,%ymm2
vhaddpd %ymm2,%ymm1,%ymm2
        366 [1]            beta = _mm256_hadd_pd(beta, beta);
vhaddpd %ymm2,%ymm2,%ymm2
        367 [1]            beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
vperm2f128 $0x1,%ymm0,%ymm1,%ymm1
vaddpd %ymm1,%ymm2,%ymm1
vsqrtpd %ymm1,%ymm1
        370 [1]        }
mov    %rdi,%rax
vzeroupper
retq

I am using clang version 4.0.0 (trunk 282862) with the command-line keys -O3 -march=native -mavx, on an Intel(R) Core(TM) i7-2670QM CPU.

What is my main misunderstanding?

Should I use inline assembly to gain better control?

0 Answers:

No answers yet.