在我的项目中,有一个函数出于性能考虑,需要在满足某个条件时跳过一段代码路径。
如果条件为真,性能提升可以达到预期的50%。
但是,如果条件为假,正常路径的性能在最坏情况下会下降30%。由于该算法本身要执行数百次循环,我不明白为什么仅仅增加一个简单的if子句会产生如此大的影响。
该函数是FFmpeg项目(FFmpeg.org)中libavfilter/vf_fillborders.c的一部分:
/*
 * For every plane of the frame, fill the left/right border columns and the
 * top/bottom border rows with a mirrored copy of the neighbouring samples.
 * Plane data is accessed as uint16_t samples.
 */
static void mirror_borders16(FillBordersContext *s, AVFrame *frame)
{
    for (int plane = 0; plane < s->nb_planes; plane++) {
        uint16_t *pix = (uint16_t *)frame->data[plane];
        /* Row stride expressed in uint16_t elements. */
        int stride = frame->linesize[plane] / sizeof(uint16_t);
        int w = s->planewidth[plane];
        int h = s->planeheight[plane];
        int border_l = s->borders[plane].left;
        int border_r = s->borders[plane].right;
        /* Element offsets: end of the plane, first row below the top border,
         * and first row of the bottom border. */
        int end_off = h * stride;
        int top_off = s->borders[plane].top * stride;
        int bottom_off = (h - s->borders[plane].bottom) * stride;
        size_t row_bytes = w * sizeof(uint16_t);

        /*
         * This guard is the additional line under discussion: when there is
         * neither a left nor a right border, the per-row loop is skipped.
         */
        if (border_l > 0 || border_r > 0) {
            /* Column index where the right border begins. */
            int right_start = w - border_r;

            for (int off = top_off; off < bottom_off; off += stride) {
                uint16_t *row = pix + off;

                /* Left border: reflect the columns just right of it. */
                for (int i = 0; i < border_l; i++) {
                    row[i] = row[2 * border_l - 1 - i];
                }
                /* Right border: reflect the columns just left of it. */
                for (int i = 0; i < border_r; i++) {
                    row[right_start + i] = row[right_start - 1 - i];
                }
            }
        }

        /* Top border: copy whole rows, mirrored around the top border edge. */
        for (int off = 0; off < top_off; off += stride) {
            memcpy(pix + off, pix + (2 * top_off - stride - off), row_bytes);
        }

        /* Bottom border: copy whole rows, mirrored around the bottom border edge. */
        int bottom_len = end_off - bottom_off;
        for (int off = 0; off < bottom_len; off += stride) {
            memcpy(pix + (bottom_off + off),
                   pix + (bottom_off - stride - off), row_bytes);
        }
    }
}
在一个类似的函数中,我使用了同样的技巧,即通过 if (left > 0 || right < width) 来跳过对 y 的无用循环。
在这种情况下,附加的if子句只带来了符合预期的约0.5%的开销。代码如下:
/*
 * For every plane of the frame, fill the borders by repeating the nearest
 * non-border sample (left/right borders) or the nearest non-border row
 * (top/bottom borders). Plane data is accessed as uint16_t samples.
 */
static void smear_borders16(FillBordersContext *s, AVFrame *frame)
{
    for (int plane = 0; plane < s->nb_planes; plane++) {
        uint16_t *pix = (uint16_t *)frame->data[plane];
        /* Row stride expressed in uint16_t elements. */
        int stride = frame->linesize[plane] / sizeof(uint16_t);
        int w = s->planewidth[plane];
        int h = s->planeheight[plane];
        /* Columns [0, left_end) form the left border and columns
         * [right_begin, w) form the right border. */
        int left_end = s->borders[plane].left;
        int right_begin = w - s->borders[plane].right;
        /* Element offsets: end of the plane, first row below the top border,
         * and first row of the bottom border. */
        int end_off = h * stride;
        int top_off = s->borders[plane].top * stride;
        int bottom_off = (h - s->borders[plane].bottom) * stride;
        size_t row_bytes = w * sizeof(uint16_t);

        /* Skip the per-row loop when there is no left or right border. */
        if (left_end > 0 || right_begin < w) {
            for (int off = top_off; off < bottom_off; off += stride) {
                uint16_t *row = pix + off;

                /* Left border: repeat the first sample right of it. */
                for (int col = 0; col < left_end; col++) {
                    row[col] = row[left_end];
                }
                /* Right border: repeat the last sample left of it. */
                for (int col = right_begin; col < w; col++) {
                    row[col] = row[right_begin - 1];
                }
            }
        }

        /* Top border: replicate the first row below it. */
        for (int off = 0; off < top_off; off += stride) {
            memcpy(pix + off, pix + top_off, row_bytes);
        }

        /* Bottom border: replicate the last row above it. */
        for (int off = bottom_off; off < end_off; off += stride) {
            memcpy(pix + off, pix + (bottom_off - stride), row_bytes);
        }
    }
}
我的处理器是Intel P8600。可以在这里找到一个MCVE(最小可复现示例):https://translate.google.com/translate?sl=de&tl=en&u=forum.ubuntuusers.de%2Fpost%2F9064193 。如果您看不懂从德语翻译过来的说明,请留言。
答案 0 :(得分:0)
我研究了反汇编得到的机器代码。插入 if (left > 0 || right > 0)
会导致编译器对后续代码的生成方式做出重大改变。看起来在第二种情况下(即加入if子句后),编译器的优化结果不够理想,这可以解释20%的性能下降。
184 /* fill left and right borders from top to bottom border */
185
186 for (int y = top2; y < bottom2; y += lz) {
0x00000000002004ca <+138>: cmp %r14d,%esi
0x00000000002004d1 <+145>: jge 0x2005ad <mirror_borders16+365>
0x00000000002004d7 <+151>: movslq %ebx,%rax
0x00000000002004da <+154>: lea -0x2(%rbp),%r15
0x00000000002004de <+158>: lea -0x1(%r11),%r12d
0x00000000002004e2 <+162>: lea (%rax,%rax,1),%r8
0x00000000002004e6 <+166>: movslq %esi,%rax
0x000000000020050e <+206>: mov $0x1,%r12d
0x0000000000200514 <+212>: mov %ebx,%r9d
0x0000000000200517 <+215>: mov %rbx,0x30(%rsp)
0x000000000020051c <+220>: sub %rax,%r15
0x000000000020051f <+223>: sub %edx,%r12d
0x0000000000200522 <+226>: mov %r14d,%ebx
0x0000000000200525 <+229>: nopl (%rax)
0x00000000002005a0 <+352>: lea (%r12,%rsi,1),%eax
0x00000000002005a4 <+356>: cmp %eax,%ebx
0x00000000002005a6 <+358>: jg 0x200528 <mirror_borders16+232>
0x00000000002005a8 <+360>: mov 0x30(%rsp),%rbx
187 for (int x = 0; x < left; x++)
0x0000000000200528 <+232>: test %r11d,%r11d
0x000000000020052b <+235>: jle 0x20055f <mirror_borders16+287>
0x000000000020052d <+237>: movslq %esi,%r14
0x0000000000200530 <+240>: mov %rdi,%rdx
0x0000000000200533 <+243>: mov %ecx,(%rsp)
0x0000000000200536 <+246>: add %r14,%r14
0x0000000000200539 <+249>: lea 0x0(%rbp,%r14,1),%rax
0x000000000020053e <+254>: add %r13,%r14
0x0000000000200541 <+257>: nopl 0x0(%rax)
0x0000000000200557 <+279>: cmp %rax,%r14
0x000000000020055a <+282>: jne 0x200548 <mirror_borders16+264>
0x000000000020055c <+284>: mov (%rsp),%ecx
188 data[y + x] = data[y + left * 2 - 1 - x];
0x00000000002004e9 <+169>: lea (%r11,%r11,1),%edx
0x00000000002004ed <+173>: sub $0x1,%ecx
0x00000000002004f0 <+176>: lea 0x0(%rbp,%rax,2),%rdi
0x00000000002004f5 <+181>: lea -0x1(%r10),%eax
0x00000000002004f9 <+185>: add %r12,%r12
0x00000000002004fc <+188>: mov %r15,%r13
0x00000000002004ff <+191>: sub %r10d,%ecx
0x0000000000200502 <+194>: add %esi,%ecx
0x0000000000200504 <+196>: add %rax,%rax
0x0000000000200507 <+199>: sub %r12,%r13
0x000000000020050a <+202>: lea -0x1(%rsi,%rdx,1),%esi
0x0000000000200548 <+264>: movzwl (%rax),%ecx
0x000000000020054b <+267>: sub $0x2,%rax
0x000000000020054f <+271>: add $0x2,%rdx
0x0000000000200553 <+275>: mov %cx,-0x2(%rdx)
189 for (int x = 0; x < right; x++)
0x000000000020055f <+287>: test %r10d,%r10d
0x0000000000200562 <+290>: jle 0x200597 <mirror_borders16+343>
0x0000000000200564 <+292>: lea 0x1(%rcx),%edx
0x0000000000200567 <+295>: movslq %ecx,%r14
0x000000000020056a <+298>: mov %ecx,(%rsp)
0x000000000020056d <+301>: add %r14,%r14
0x0000000000200570 <+304>: movslq %edx,%rdx
0x0000000000200573 <+307>: lea 0x0(%rbp,%r14,1),%rax
0x0000000000200578 <+312>: add %r15,%r14
0x000000000020057b <+315>: lea 0x0(%rbp,%rdx,2),%rdx
0x000000000020058f <+335>: cmp %rax,%r14
0x0000000000200592 <+338>: jne 0x200580 <mirror_borders16+320>
0x0000000000200594 <+340>: mov (%rsp),%ecx
0x0000000000200597 <+343>: add %r9d,%esi
0x000000000020059a <+346>: add %r9d,%ecx
0x000000000020059d <+349>: add %r8,%rdi
190 data[y + width - right + x] = data[y + width - right - 1 - x];
0x0000000000200580 <+320>: movzwl (%rax),%ecx
0x0000000000200583 <+323>: sub $0x2,%rax
0x0000000000200587 <+327>: add $0x2,%rdx
0x000000000020058b <+331>: mov %cx,-0x2(%rdx)
191 }
加入用于跳过无用循环的if子句之后:
184 /* fill left and right borders from top to bottom border */
185 if (left > 0 || right > 0) // in case skip for performance
0x00000000002004f7 <+135>: test %r8d,%r8d
0x00000000002004fe <+142>: jg 0x200640 <mirror_borders16+464>
0x0000000000200504 <+148>: test %ecx,%ecx
0x0000000000200506 <+150>: jg 0x200640 <mirror_borders16+464>
186 for (int y = top2; y < bottom2; y += lz) {
0x0000000000200640 <+464>: cmp 0x24(%rsp),%r15d
0x0000000000200645 <+469>: jge 0x20050c <mirror_borders16+156>
0x000000000020064b <+475>: mov 0x20(%rsp),%ebp
0x0000000000200661 <+497>: mov %r15d,0x4c(%rsp)
0x0000000000200666 <+502>: sub %ecx,%r9d
0x0000000000200669 <+505>: lea -0x1(%rax,%r15,1),%esi
0x000000000020066e <+510>: mov 0x24(%rsp),%r15d
0x0000000000200673 <+515>: sub %eax,%ebp
0x0000000000200675 <+517>: lea (%r11,%rdx,2),%rdi
0x0000000000200679 <+521>: lea -0x1(%r8),%edx
0x000000000020067d <+525>: mov %ebp,%r10d
0x0000000000200680 <+528>: add %r9d,%ebp
0x0000000000200683 <+531>: lea -0x2(%r11),%r9
0x0000000000200687 <+535>: movslq %ebx,%r13
0x000000000020068a <+538>: add %rdx,%rdx
0x000000000020068d <+541>: mov %ebx,%r12d
0x0000000000200690 <+544>: mov %r9,%r14
0x0000000000200693 <+547>: mov $0x1,%r9d
0x0000000000200699 <+553>: add %r13,%r13
0x000000000020069c <+556>: sub %ecx,%r10d
0x000000000020069f <+559>: sub %rdx,%r14
0x00000000002006a2 <+562>: sub %eax,%r9d
0x00000000002006a5 <+565>: mov %rbx,0x38(%rsp)
0x00000000002006aa <+570>: nopw 0x0(%rax,%rax,1)
0x000000000020072d <+701>: lea (%r9,%rsi,1),%eax
0x0000000000200731 <+705>: cmp %eax,%r15d
0x0000000000200734 <+708>: jg 0x2006b0 <mirror_borders16+576>
0x000000000020073a <+714>: mov 0x38(%rsp),%rbx
0x000000000020073f <+719>: mov 0x4c(%rsp),%r15d
0x0000000000200744 <+724>: jmpq 0x20050c <mirror_borders16+156>
0x0000000000200749 <+729>: repz retq
0x000000000020074b: nopl 0x0(%rax,%rax,1)
187 for (int x = 0; x < left; x++)
0x00000000002006b0 <+576>: test %r8d,%r8d
0x00000000002006b3 <+579>: jle 0x2006ec <mirror_borders16+636>
0x00000000002006b5 <+581>: movslq %esi,%rbx
0x00000000002006b8 <+584>: mov %rdi,%rdx
0x00000000002006bb <+587>: mov %ecx,0x8(%rsp)
0x00000000002006bf <+591>: add %rbx,%rbx
0x00000000002006c2 <+594>: lea (%r11,%rbx,1),%rax
0x00000000002006c6 <+598>: add %r14,%rbx
0x00000000002006c9 <+601>: nopl 0x0(%rax)
0x00000000002006df <+623>: cmp %rax,%rbx
0x00000000002006e2 <+626>: jne 0x2006d0 <mirror_borders16+608>
0x00000000002006e4 <+628>: mov 0x8(%rsp),%ecx
0x00000000002006f0 <+640>: mov %esi,0x8(%rsp)
0x00000000002006f4 <+644>: cltq
0x00000000002006f6 <+646>: lea (%r11,%rax,2),%rdx
0x00000000002006fa <+650>: lea 0x0(%rbp,%rsi,1),%eax
0x00000000002006fe <+654>: cltq
0x0000000000200700 <+656>: lea (%r11,%rax,2),%rbx
0x0000000000200704 <+660>: xor %eax,%eax
0x0000000000200706 <+662>: nopw %cs:0x0(%rax,%rax,1)
188 data[y + x] = data[y + left * 2 - 1 - x];
0x000000000020064f <+479>: lea (%r8,%r8,1),%eax
0x0000000000200653 <+483>: mov 0x18(%rsp),%r11
0x0000000000200658 <+488>: mov $0x1,%r9d
0x000000000020065e <+494>: movslq %r15d,%rdx
0x00000000002006d0 <+608>: movzwl (%rax),%ecx
0x00000000002006d3 <+611>: sub $0x2,%rax
0x00000000002006d7 <+615>: add $0x2,%rdx
0x00000000002006db <+619>: mov %cx,-0x2(%rdx)
189 for (int x = 0; x < right; x++)
0x00000000002006e8 <+632>: test %ecx,%ecx
0x00000000002006ea <+634>: jle 0x200727 <mirror_borders16+695>
0x00000000002006ec <+636>: lea (%r10,%rsi,1),%eax
0x000000000020071f <+687>: cmp %eax,%ecx
0x0000000000200721 <+689>: jg 0x200710 <mirror_borders16+672>
0x0000000000200723 <+691>: mov 0x8(%rsp),%esi
0x0000000000200727 <+695>: add %r12d,%esi
0x000000000020072a <+698>: add %r13,%rdi
190 data[y + width - right + x] = data[y + width - right - 1 - x];
0x0000000000200710 <+672>: movzwl (%rdx),%esi
0x0000000000200713 <+675>: sub $0x2,%rdx
0x0000000000200717 <+679>: mov %si,(%rbx,%rax,2)
0x000000000020071b <+683>: add $0x1,%rax
191 }