I implemented a Voronoi diagram generator for points in the plane using Fortune's algorithm, built on a native STL tree with a transparent comparator. Almost all the bugs found so far are fixed, and the asymptotic complexity of the implementation is O(N * log(N)). Now I want to try some optimizations. Using profiling results (I use pprof from Google Performance Tools)
I came to the conclusion that there are two very hot functions, and both of them express very idiomatic mathematical computations: finding the ordinate of the intersection point of two parabolic arcs given a common directrix, and computing the center and circumradius of the circle through 3 points. The second one is the interesting case here: the circumcircle of 3 points (sites of the Voronoi diagram) defines a Voronoi diagram vertex, hence the so-called "circle events" in Fortune's-algorithm terms. The vertex-finding function (based on these formulae) is as follows:
// Coordinate type used throughout the Voronoi computation.
using value_type = double;
// 2D site/vertex coordinate pair. alignas(__m128d) forces 16-byte alignment
// so a whole point can be fetched with one 16-byte vector load -- the AVX
// branch below relies on this when it casts &point to __m128d*.
struct alignas(__m128d) point
{
value_type x, y;
};
// Computes the Voronoi vertex defined by the three sites a, b, c: the
// circumcenter of the triangle (a, b, c) together with the circumradius R.
// Returns an empty optional when the sites fail the orientation test below,
// i.e. when no circle event should be generated.
// NOTE(review): `vertex` and `eps` are declared elsewhere in this file;
// presumably vertex is an aggregate {point, value_type R} (see the return
// statements) and eps is a small positive tolerance -- confirm against the
// surrounding code.
std::experimental::optional< vertex >
make_vertex(point const & a,
point const & b,
point const & c)
{
#if 1
// --- Scalar branch: circumcircle via the standard determinant formulae ---
// A, B, C: squared distances of each site from the origin.
value_type const A = a.x * a.x + a.y * a.y;
value_type const B = b.x * b.x + b.y * b.y;
value_type const C = c.x * c.x + c.y * c.y;
// Coordinates of a and b relative to c.
point const ca = {a.x - c.x, a.y - c.y};
point const cb = {b.x - c.x, b.y - c.y};
value_type const CA = A - C;
value_type const CB = B - C;
// Numerators of the circumcenter coordinates; divided by 2*alpha below.
value_type x = CA * cb.y - CB * ca.y;
value_type y = ca.x * CB - cb.x * CA;
// alpha = cross(ca, cb) = twice the signed area of triangle (a, b, c).
value_type alpha = ca.x * cb.y - ca.y * cb.x;
// Accept only a strictly negative area beyond the tolerance (alpha < -eps);
// otherwise the triple has the wrong orientation (or is degenerate) and
// produces no circle event.
if (!(eps < -alpha)) {
return {};
}
// beta: the remaining 3x3 determinant term of the circumcircle equation;
// after division by alpha it combines with x^2 + y^2 to give R^2 (see sqrt).
value_type beta = a.x * (b.y * C - c.y * B) - b.x * (a.y * C - c.y * A) + c.x * (a.y * B - b.y * A);
beta /= alpha;
// The circumcenter formula divides the numerators by 2 * (signed area).
alpha += alpha;
x /= alpha;
y /= alpha;
using std::sqrt;
// Circumradius: R^2 = beta + x^2 + y^2 with (x, y) the circumcenter.
value_type const R = sqrt(beta + x * x + y * y);
return {{{x, y}, R}};
#else
// --- AVX branch: same math on 256-bit ymm registers (stated by the author
// to be exactly equivalent to the scalar branch above) ---
// Broadcast each 2D point into both 128-bit lanes of a ymm register.
__m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
__m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
__m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
// Squared norms: element-wise square, then horizontal add duplicates
// |p|^2 across each lane (vector counterparts of scalar A, B, C).
__m256d A = _mm256_mul_pd(a_, a_);
A = _mm256_hadd_pd(A, A);
__m256d B = _mm256_mul_pd(b_, b_);
B = _mm256_hadd_pd(B, B);
__m256d C = _mm256_mul_pd(c_, c_);
C = _mm256_hadd_pd(C, C);
// Shuffle the differences (a - c) and (b - c) into the lane order needed so
// that one hsub below yields both the x and y numerators at once (the name
// encodes the intended element order: cb.y, ca.y, ca.x, cb.x).
__m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
// ABBA: lanes holding CA = A - C and CB = B - C in the matching order.
__m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
// Products whose horizontal subtraction gives the circumcenter numerators.
__m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
xxyy = _mm256_hsub_pd(xxyy, xxyy);
// Gather [x, y, x, y] by combining xxyy with its lane-swapped copy.
__m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
// alpha = cross(ca, cb), replicated; multiply by the lane-swapped copy then
// hsub to form the 2x2 determinant (scalar: ca.x*cb.y - ca.y*cb.x).
__m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
alpha = _mm256_hsub_pd(alpha, alpha);
// Same orientation test as the scalar branch: alpha < -eps.
// (alpha[0] uses the gcc/clang vector-subscript extension.)
if (!(alpha[0] < -eps)) {
return {};
}
// tmp1, tmp2: coordinate cross-products feeding the beta determinant.
__m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
__m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
// bacc plays the role of the scalar beta (plus duplicated partial terms
// consumed by the hadd/permute2f128 combination at the end).
__m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
bacc = _mm256_div_pd(bacc, alpha);
// Circumcenter: numerators divided by 2 * alpha, as in the scalar branch.
xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
// R = sqrt(beta + x^2 + y^2), evaluated with horizontal adds; after this
// point the upper halves of the registers may contain garbage.
__m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
beta = _mm256_hadd_pd(beta, beta);
beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
// Only the low elements are meaningful in the result.
return {{{xyxy[0], xyxy[1]}, beta[0]}};
#endif
}
As you can see, I tried to re-implement this function using intrinsics for Intel AVX instructions. The `#if 1`
and `#if 0`
branches are both fully equivalent and correct. The `#if 0`
branch mostly operates on full-width ymm registers; only at the very end do the upper halves of some variables contain garbage.
I ran several tests on a huge input (100000 points uniformly distributed over a bounded region of the plane): one build with `#if 1`
and another with `#if 0`
. The first one gives a better running time (0.28 seconds versus an average of 0.31 seconds over many attempts). Here is the disassembly of the `#if 1`
branch:
push %rbx
sub $0x10,%rsp
mov %rdi,%rbx
319 [1] value_type const A = a.x * a.x + a.y * a.y;
vmovsd (%rdx),%xmm11
vmovsd 0x8(%rdx),%xmm0
320 [1] value_type const B = b.x * b.x + b.y * b.y;
vmovsd (%rcx),%xmm9
vmovsd 0x8(%rcx),%xmm2
321 [1] value_type const C = c.x * c.x + c.y * c.y;
vmovsd (%r8),%xmm8
vmovsd 0x8(%r8),%xmm6
322 [1] point const ca = {a.x - c.x, a.y - c.y};
vunpcklpd %xmm9,%xmm0,%xmm1
vunpcklpd %xmm8,%xmm6,%xmm5
vsubpd %xmm5,%xmm1,%xmm7
323 [1] point const cb = {b.x - c.x, b.y - c.y};
vunpcklpd %xmm11,%xmm2,%xmm1
vsubpd %xmm5,%xmm1,%xmm1
328 [1] value_type alpha = ca.x * cb.y - ca.y * cb.x;
vpermilpd $0x1,%xmm1,%xmm5
vmulsd %xmm1,%xmm5,%xmm5
vpermilpd $0x1,%xmm7,%xmm3
vmulsd %xmm7,%xmm3,%xmm3
vsubsd %xmm3,%xmm5,%xmm12
329 [1] if (!(eps < -alpha)) {
mov (%rsi),%rax
vxorpd 0x1cd1(%rip),%xmm12,%xmm3 # 0x40e680
vucomisd (%rax),%xmm3
jbe 0x40ca85 <make_vertex()+309>
319 [2] value_type const A = a.x * a.x + a.y * a.y;
vunpcklpd %xmm9,%xmm11,%xmm3
vmulpd %xmm3,%xmm3,%xmm10
vunpcklpd %xmm2,%xmm0,%xmm3
vmulpd %xmm3,%xmm3,%xmm3
vaddpd %xmm3,%xmm10,%xmm3
321 [2] value_type const C = c.x * c.x + c.y * c.y;
vmulsd %xmm8,%xmm8,%xmm10
vmulsd %xmm6,%xmm6,%xmm4
vaddsd %xmm4,%xmm10,%xmm4
324 [1] value_type const CA = A - C;
vmovddup %xmm4,%xmm5
vsubpd %xmm5,%xmm3,%xmm5
326 [1] value_type x = CA * cb.y - CB * ca.y;
vmulpd %xmm5,%xmm1,%xmm1
vpermilpd $0x1,%xmm5,%xmm5
vmulpd %xmm5,%xmm7,%xmm5
vsubpd %xmm5,%xmm1,%xmm10
332 [1] value_type beta = a.x * (b.y * C - c.y * B) + b.x * (c.y * A - a.y * C) + c.x * (a.y * B - b.y * A);
vmulsd %xmm4,%xmm2,%xmm5
vpermilpd $0x1,%xmm3,%xmm7
vmulsd %xmm7,%xmm6,%xmm1
vsubsd %xmm1,%xmm5,%xmm1
vmulsd %xmm1,%xmm11,%xmm1
vmulsd %xmm6,%xmm3,%xmm5
vmulsd %xmm4,%xmm0,%xmm4
vsubsd %xmm4,%xmm5,%xmm4
vmulsd %xmm4,%xmm9,%xmm4
vaddsd %xmm4,%xmm1,%xmm1
vmulsd %xmm7,%xmm0,%xmm0
vmulsd %xmm3,%xmm2,%xmm2
vsubsd %xmm2,%xmm0,%xmm0
vmulsd %xmm0,%xmm8,%xmm0
vaddsd %xmm1,%xmm0,%xmm0
333 [1] beta /= alpha;
vdivsd %xmm12,%xmm0,%xmm0
334 [1] alpha += alpha;
vaddsd %xmm12,%xmm12,%xmm1
335 [1] x /= alpha;
vmovddup %xmm1,%xmm1
vdivpd %xmm1,%xmm10,%xmm2
338 [1] value_type const R = sqrt(beta + x * x + y * y);
vmulsd %xmm2,%xmm2,%xmm1
vaddsd %xmm0,%xmm1,%xmm0
vpermilpd $0x1,%xmm2,%xmm1
vmulsd %xmm1,%xmm1,%xmm1
vaddsd %xmm0,%xmm1,%xmm1
vsqrtsd %xmm1,%xmm1,%xmm0
370 [1] }
mov %rbx,%rax
add $0x10,%rsp
pop %rbx
retq
And the disassembly of the `#if 0`
branch:
317 [1] {
vmovupd (%rdx),%xmm2
vmovupd (%rcx),%xmm1
vmovupd (%r8),%xmm4
350 [1] __m256d byayaxbx = _mm256_permute_pd(_mm256_shuffle_pd(_mm256_sub_pd(a_, c_), _mm256_sub_pd(b_, c_), 0b0011), 0b1001);
vsubpd %xmm4,%xmm2,%xmm0
vinsertf128 $0x1,%xmm0,%ymm0,%ymm0
vsubpd %xmm4,%xmm1,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
vpermilpd $0x1,%ymm3,%ymm3
vblendpd $0x6,%ymm0,%ymm3,%ymm5
355 [1] __m256d alpha = _mm256_mul_pd(byayaxbx, _mm256_permute2f128_pd(byayaxbx, byayaxbx, 0x01));
vperm2f128 $0x1,%ymm0,%ymm5,%ymm0
vmulpd %ymm0,%ymm5,%ymm0
356 [1] alpha = _mm256_hsub_pd(alpha, alpha);
vhsubpd %ymm0,%ymm0,%ymm0
357 [1] if (!(alpha[0] < -eps)) {
mov (%rsi),%rax
vmovsd (%rax),%xmm3
vxorpd 0x1cc6(%rip),%xmm3,%xmm3 # 0x40e660
vucomisd %xmm0,%xmm3
jbe 0x40ca6d <make_vertex()+285>
343 [1] __m256d c_ = _mm256_broadcast_pd((__m128d *)&c);
vinsertf128 $0x1,%xmm4,%ymm4,%ymm6
344 [1] __m256d A = _mm256_mul_pd(a_, a_);
vmulpd %xmm2,%xmm2,%xmm3
vinsertf128 $0x1,%xmm3,%ymm3,%ymm3
345 [1] A = _mm256_hadd_pd(A, A);
vhaddpd %ymm3,%ymm3,%ymm3
346 [1] __m256d B = _mm256_mul_pd(b_, b_);
vmulpd %xmm1,%xmm1,%xmm7
vinsertf128 $0x1,%xmm7,%ymm7,%ymm7
347 [1] B = _mm256_hadd_pd(B, B);
vhaddpd %ymm7,%ymm7,%ymm7
348 [1] __m256d C = _mm256_mul_pd(c_, c_);
vmulpd %xmm4,%xmm4,%xmm4
vinsertf128 $0x1,%xmm4,%ymm4,%ymm4
349 [1] C = _mm256_hadd_pd(C, C);
vhaddpd %ymm4,%ymm4,%ymm4
351 [1] __m256d ABBA = _mm256_permute_pd(_mm256_sub_pd(_mm256_shuffle_pd(A, B, 0), C), 0b0110);
vunpcklpd %ymm7,%ymm3,%ymm8
vsubpd %ymm4,%ymm8,%ymm8
vpermilpd $0x6,%ymm8,%ymm8
352 [1] __m256d xxyy = _mm256_mul_pd(byayaxbx, ABBA);
vmulpd %ymm8,%ymm5,%ymm5
353 [1] xxyy = _mm256_hsub_pd(xxyy, xxyy);
vhsubpd %ymm5,%ymm5,%ymm5
342 [1] __m256d b_ = _mm256_broadcast_pd((__m128d *)&b);
vinsertf128 $0x1,%xmm1,%ymm1,%ymm8
341 [1] __m256d a_ = _mm256_broadcast_pd((__m128d *)&a);
vinsertf128 $0x1,%xmm2,%ymm2,%ymm2
354 [1] __m256d xyxy = _mm256_shuffle_pd(xxyy, _mm256_permute2f128_pd(xxyy, xxyy, 0x01), 0);
vperm2f128 $0x23,%ymm5,%ymm0,%ymm9
vunpcklpd %ymm9,%ymm5,%ymm5
360 [1] __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a_, 0b1001), _mm256_permute2f128_pd(c_, _mm256_permute_pd(b_, 0b01), 0x20));
vpermilpd $0x9,%ymm2,%ymm2
vpermilpd $0x1,%xmm1,%xmm1
vinsertf128 $0x1,%xmm1,%ymm6,%ymm1
vmulpd %ymm1,%ymm2,%ymm1
361 [1] __m256d tmp2 = _mm256_mul_pd(b_, _mm256_permute_pd(c_, 0b01));
vpermilpd $0x1,%ymm6,%ymm2
vmulpd %ymm2,%ymm8,%ymm2
362 [1] __m256d bacc = _mm256_permute_pd(_mm256_hsub_pd(_mm256_mul_pd(tmp1, _mm256_permute2f128_pd(B, C, 0x20)), _mm256_mul_pd(tmp2, A)), 0b0010);
vinsertf128 $0x1,%xmm4,%ymm7,%ymm4
vmulpd %ymm4,%ymm1,%ymm1
vmulpd %ymm2,%ymm3,%ymm2
vhsubpd %ymm2,%ymm1,%ymm1
vpermilpd $0x2,%ymm1,%ymm1
363 [1] bacc = _mm256_div_pd(bacc, alpha);
vdivpd %ymm0,%ymm1,%ymm1
364 [1] xyxy = _mm256_div_pd(xyxy, _mm256_add_pd(alpha, alpha));
vaddpd %ymm0,%ymm0,%ymm0
vdivpd %ymm0,%ymm5,%ymm0
365 [1] __m256d beta = _mm256_hadd_pd(bacc, _mm256_mul_pd(xyxy, xyxy));
vmulpd %ymm0,%ymm0,%ymm2
vhaddpd %ymm2,%ymm1,%ymm2
366 [1] beta = _mm256_hadd_pd(beta, beta);
vhaddpd %ymm2,%ymm2,%ymm2
367 [1] beta = _mm256_sqrt_pd(_mm256_add_pd(_mm256_permute2f128_pd(bacc, bacc, 0x01), beta));
vperm2f128 $0x1,%ymm0,%ymm1,%ymm1
vaddpd %ymm1,%ymm2,%ymm1
vsqrtpd %ymm1,%ymm1
370 [1] }
mov %rdi,%rax
vzeroupper
retq
I compile with clang version 4.0.0 (trunk 282862)
using the key command-line flags -O3 -march=native -mavx
, on an Intel(R) Core(TM) i7-2670QM CPU
.
What is my main misconception here?
Should I resort to inline assembly to gain finer control?