Question

我原以为，内存访问会比Alpha混合所需的乘法和除法（即便经过编译器优化）更快，但实际上它并没有预期的那么快。

在这种情况下，用于表的16兆字节不是问题。但如果查表甚至比直接由CPU完成全部计算还慢，那就成问题了。

有谁能向我解释一下原因，以及究竟发生了什么？在较慢的CPU上，查表法是否会胜出？

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <time.h>

/* Largest value a single color channel can hold (the maximum of unsigned char). */
#define COLOR_MAX UCHAR_MAX

/* One color channel, stored as a single unsigned char. */
typedef unsigned char color;

/*
 * Pointer to the first [COLOR_MAX + 1][COLOR_MAX + 1] plane of the lookup
 * table; indexed as blending_table[destination][source][a] once
 * initialize_blending_table() has allocated and filled it.
 */
color (*blending_table)[COLOR_MAX + 1][COLOR_MAX + 1];

/*
 * Alpha-blend one channel: weight `source` by `a` and `destination` by
 * (COLOR_MAX - a), then scale the sum back down by COLOR_MAX.
 * The division truncates, and the quotient is narrowed to `color` on return.
 */
static color blend(unsigned int destination, unsigned int source, unsigned int a) {
    const unsigned int inverse_a = COLOR_MAX - a;
    const unsigned int weighted_sum = source * a + destination * inverse_a;

    return weighted_sum / COLOR_MAX;
}

/*
 * Allocate the global blending table and fill every entry with
 * blend(destination, source, a) for all channel values 0..COLOR_MAX.
 *
 * The table has (COLOR_MAX + 1)^3 entries of type `color`. If the
 * allocation fails, a diagnostic is written to stderr and the process exits
 * with EXIT_FAILURE, so the table can be indexed unconditionally once this
 * function returns.
 */
void initialize_blending_table(void) {
    int destination, source, a;

    blending_table = malloc((COLOR_MAX + 1) * sizeof *blending_table);
    if (blending_table == NULL) {
        /* Without this check the first store below would dereference NULL. */
        fputs("initialize_blending_table: out of memory\n", stderr);
        exit(EXIT_FAILURE);
    }

    for (destination = 0; destination <= COLOR_MAX; ++destination) {
        for (source = 0; source <= COLOR_MAX; ++source) {
            for (a = 0; a <= COLOR_MAX; ++a) {
                blending_table[destination][source][a] = blend(destination, source, a);
            }
        }
    }
}

/* Pair of clock() readings delimiting one measured interval. */
struct timer {
    /* clock() value stored by timer_start() */
    double start;
    /* clock() value stored by timer_end() */
    double end;
};

/* Store the current clock() reading as the start of the interval. */
void timer_start(struct timer *self) {
    const clock_t now = clock();

    self->start = now;
}

/* Store the current clock() reading as the end of the interval. */
void timer_end(struct timer *self) {
    const clock_t now = clock();

    self->end = now;
}

/*
 * Return the length of the recorded interval in seconds: the difference
 * between the end and start readings divided by CLOCKS_PER_SEC.
 */
double timer_measure_in_seconds(struct timer *self) {
    const double elapsed_ticks = self->end - self->start;

    return elapsed_ticks / CLOCKS_PER_SEC;
}

/*
 * Upper bound of the outer repetition loop in main(). The loops test
 * `i <= n`, so each benchmark performs n + 1 full sweeps.
 * NOTE(review): a one-letter lowercase macro replaces every later `n` token
 * in this file; consider a longer upper-case name.
 */
#define n 300

/*
 * Benchmark driver. Times three phases with clock() and prints each duration
 * in seconds:
 *   "init"     - building the lookup table,
 *   "table"    - reading every table entry, repeated n + 1 times,
 *   "function" - calling blend() for every input triple, repeated n + 1 times.
 * The table is released before returning.
 */
int main(void) {
    struct timer timer;
    /*
     * NOTE(review): all five variables are volatile, not only the sink `m`,
     * so every loop-counter read and write is a real memory access and that
     * cost is included in both the "table" and the "function" timings.
     */
    volatile int i, j, k, l, m;

    timer_start(&timer);
    initialize_blending_table();
    timer_end(&timer);
    printf("init %f\n", timer_measure_in_seconds(&timer));

    /* Phase 2: sweep the table in index order, storing each entry into m. */
    timer_start(&timer);
    for (i = 0; i <= n; ++i) {
        for (j = 0; j <= COLOR_MAX; ++j) {
            for (k = 0; k <= COLOR_MAX; ++k) {
                for (l = 0; l <= COLOR_MAX; ++l) {
                    m = blending_table[j][k][l];
                }
            }
        }
    }
    timer_end(&timer);
    printf("table %f\n", timer_measure_in_seconds(&timer));

    /* Phase 3: the same sweep, computing each value with blend() instead. */
    timer_start(&timer);
    for (i = 0; i <= n; ++i) {
        for (j = 0; j <= COLOR_MAX; ++j) {
            for (k = 0; k <= COLOR_MAX; ++k) {
                for (l = 0; l <= COLOR_MAX; ++l) {
                    m = blend(j, k, l);
                }
            }
        }
    }
    timer_end(&timer);
    printf("function %f\n", timer_measure_in_seconds(&timer));

    /* Nothing reads the table past this point; release it. */
    free(blending_table);
    blending_table = NULL;

    return EXIT_SUCCESS;
}

结果

$ gcc test.c -O3
$ ./a.out
init 0.034328
table 14.176643
function 14.183924

Answer 1

查表并不是灵丹妙药。当表足够小时它会有所帮助，但在你的情况下表非常大。你写道：

在这种情况下，用于表的16兆字节不是问题

我认为这是非常错误的，而且很可能正是你所遇到问题的根源。16兆字节对于L1缓存来说太大了，因此按随机索引从表中读取数据时，必然要访问更慢的缓存层级（L2、L3等）。缓存未命中的代价通常很大；只有当混合算法本身非常复杂时，LUT方案才可能更快。

详情请参阅维基百科的相关条目。

Answer 2

你的基准测试存在无可救药的缺陷：它按顺序读取表，这使得LUT看起来比实际情况好得多。

如果你的性能结果显示LUT尚且不如直接计算，那么一旦换成实际应用中的随机访问模式并出现缓存未命中，LUT的表现只会更糟。

应当专注于改进计算本身，并实现矢量化。这样做的回报很可能高于基于查找表的方法。

(source * a + destination * (COLOR_MAX - a)) / COLOR_MAX

重新排列成为

(source * a + destination * COLOR_MAX - destination * a) / COLOR_MAX

简化为

destination + (source - destination) * a / COLOR_MAX

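这样只需一次乘法和一次除以常数的除法，两者都非常高效，而且很容易矢量化。需要注意：source - destination 可能为负，因此这一形式必须使用有符号整数运算（问题中的 blend 使用的是 unsigned int）；另外，由于整数除法的截断位置不同，个别结果可能与原公式相差1。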

你还应该将辅助函数标记为inline，不过优秀的编译器很可能无论如何都会将其内联。

使用表查找进行Alpha混合的速度不如预期的那么快

2 个答案: