Question

我需要提高以下代码的性能（Intel Ivy Bridge，x64）：

/* Threshold that the accumulated sum is compared against. */
unsigned int delta;
/* a[] holds the divisors and b[] the dividends of the 100 quotients. */
unsigned int a[100];
unsigned int b[100];

...

/* Accumulate b[i]/a[i] over all 100 elements; the cast makes the division happen in double. */
double sum = 0;

for(int i = 0; i < 100; i++)
    sum += (double)b[i]/a[i];

/* True when delta is at least as large as the sum. */
bool possible = delta >= sum;

瓶颈确实是double运算，它使执行时间增加了3倍。a[index]可以是0到500m之间的任何值。b[index]将在0到500之间。

问：在对这段代码的两次调用之间如何修改数组a和b？

每次调用之间的唯一差异是a[index]++;，其中0 <= index < 100。b始终相同，delta也不会改变。

由于结果要与另一个数字进行比较并存储为布尔值，我绝对需要尽可能高的精度。这就是我使用double而不是float的原因。如你所知，即使只有1/1b（十亿分之一）的差异也会得到错误的值，因为结果是布尔值！

Answer 1

一件事：

将英特尔汇编硬编码到您的程序中会降低其可移植性，使其更加脆弱、安全性更低，并且通常令人望而生畏。这是一项应当避免的任务，除非您需要从裸机中榨取最后一点性能，例如编写内核级代码（驱动程序和调度程序）。这里的情况可能并不适合这样做。

事情二：

除非你有如神助，否则你可能无法写出比现有代码更快的汇编。C++包含深奥的魔法，许多常规操作都会被编译成违反直觉的优化代码，这些优化比朴素的解决方案更高效。

三件事：

汇编不是你的问题所在。double代表双精度浮点数，浮点运算通常比整数运算更昂贵，而这个瓶颈是该计算所固有的。

Answer 2

通过考虑算法更改，可以最好地解决问题。

您声明更新数组的简单解决方案（减去旧数据并添加新值）是不可接受的，因为精度至关重要。该解决方案具有时间复杂度O(u)，其中u是数组更新的数量，空间复杂度为O(1)。

到目前为止，所有解决方案都依赖于对整个数组进行重新求和，而每次迭代中只有一个条目发生变化。这是时间复杂度O(un)和空间复杂度O(1)。

但显而易见的解决方案是仅对数组中发生变化的“部分”重新求和！当数组中的一个元素更新时，只有一半数组的和发生了变化，在那一半中又只有一半发生了变化，在那一半的一半中又只有一半发生了变化……

我的解决方案是保留每个子数组之和的完整树。在每次更新时，我都会从叶子向上传播已更改的和，重用之前在未更改的子树上完成的所有子数组求和。这需要O(u log n)的时间复杂度，代价是O(n)的空间复杂度。

代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/**
 * Controlling variables. Each one can be overridden on the compiler command
 * line with -DNAME=value; the values below are the defaults.
 */

/* Number of timed speedy() calls made by main(). */
#ifndef REPEAT
#define REPEAT         2260
#endif
/* Non-zero makes speedy() call slowyUpd(); zero makes it call speedyUpd(). */
#ifndef NAIVE
#define NAIVE          0
#endif
/* Value main() uses for both the element count n and the workspace width N. */
#ifndef PROBLEM
#define PROBLEM        (1<<7)
#endif
/* Non-zero makes main() print a '.' after every timed call and a final newline. */
#ifndef PRINT_PROGRESS
#define PRINT_PROGRESS 1
#endif


/**
 * Fills the workspace for a fresh problem and returns the initial total.
 *
 * Workspace layout:
 *   w[1 .. N-1]      partial sums, root in w[1]
 *   w[N .. 2N-1]     copy of a[] as doubles
 *   w[2N .. 3N-1]    copy of b[] as doubles
 *
 * a, b  input arrays, n elements each
 * n     number of real elements
 * w     workspace
 * N     width of the bottom level of the tree
 */

static double speedyInit(unsigned* a, unsigned* b, unsigned n, double* w, unsigned N){
    double* const denom = w + N;
    double* const numer = w + 2*N;
    unsigned      k     = 0;
    unsigned      src, dst;

    /* Real elements: keep both operands as doubles and park the quotient in w[k]. */
    while(k < n){
        denom[k] = a[k];
        numer[k] = b[k];
        w[k]     = numer[k]/denom[k];
        k++;
    }

    /* Padding from n up to N: 0/1 quotients, which add nothing to any sum. */
    while(k < N){
        denom[k] = 1.0;
        numer[k] = 0.0;
        w[k]     = 0.0;
        k++;
    }

    /*
     * Fold adjacent quotient pairs into the upper half of w[0..N-1].
     * Walking downwards means a slot is only overwritten once the quotients
     * it held have been consumed.
     */
    src = N;
    dst = N;
    while(src > 1){
        dst--;
        w[dst] = w[src-2] + w[src-1];
        src -= 2;
    }

    /* Every remaining node is the sum of its two children. */
    for(dst--; dst != 0; dst--){
        w[dst] = w[2*dst] + w[2*dst+1];
    }

    /* The root holds the overall sum. */
    return w[1];
}


/**
 * Increments a[i] and refreshes only the partial sums that depend on it.
 * Returns the new overall sum.
 *
 * a  integer array being updated
 * w  workspace laid out as: sums, then "a" as doubles at w+N, then "b" as
 *    doubles at w+2*N
 * N  width of the bottom level of the tree
 * i  index of the element to increment
 */

static double speedyUpd(unsigned* a, double* w, unsigned N, unsigned i){
    double* const  denom   = w + N;
    double* const  numer   = w + 2*N;
    const unsigned sibling = i ^ 1u;
    double         changed, other;
    unsigned       node;

    /*
     * Keep the integer array and its double mirror in step. A double can
     * hold any 32-bit unsigned value exactly.
     */
    a[i]     += 1;
    denom[i] += 1;

    /*
     * Quotients are not stored, so recompute the changed leaf and
     * (slightly wastefully) its sibling.
     */
    changed = numer[i]/denom[i];
    other   = numer[sibling]/denom[sibling];

    /*
     * Climb from the leaf pair's parent to the root. At each node store the
     * sum of the two children just handled, then fetch the node's sibling so
     * the pair can be combined one level higher.
     */
    for(node = (N>>1) + (i>>1); node != 0; node >>= 1){
        changed = changed + other;
        w[node] = changed;
        other   = w[node^1];
    }

    /* The root holds the overall sum. */
    return w[1];
}


/**
 * Reference implementation: increments a[i], then recomputes the total by
 * dividing and adding all N entries again. Returns that total.
 */

static double slowyUpd(unsigned* a, double* w, unsigned N, unsigned i){
    const double* denom = w + N;
    const double* numer = w + 2*N;
    double        total = 0;
    unsigned      k;

    /* Apply the change to the integer array and to its double mirror. */
    a[i]     += 1;
    w[N + i] += 1;

    /* Full pass over the bottom level, padding entries included. */
    for(k = 0; k != N; ++k){
        total += numer[k]/denom[k];
    }

    return total;
}


/**
 * Example driver: builds the sum tree, then performs n single-element
 * increments of "a" at rand()-chosen indices, refreshing the total after
 * each one.
 *
 * Requires N a power of two bigger than one.
 * Requires n <= N.
 * Requires workspace w of 3*N doubles.
 *
 * Returns the total after the last update, or the initial total when n is 0.
 */


double speedy(unsigned* a, unsigned* b, unsigned n, double* w, unsigned N){
    const double delta = 0;
    double       sum   = speedyInit(a, b, n, w, N);
    unsigned     left;

    /*
     * One update per element. Counting with an unsigned variable (rather
     * than an int seeded from n) keeps the trip count exact for every n,
     * and n == 0 skips the loop instead of evaluating rand()%0.
     */
    for(left = n; left > 0; left--){
        /* Do whatever */
        /* ... */
        const unsigned idx = (unsigned)rand() % n;
        /* ... */
        #if NAIVE
        sum = slowyUpd(a, w, N, idx);
        #else
        sum = speedyUpd(a, w, N, idx);
        #endif
        /* ... */
        int possible = delta >= sum;
        (void)possible;   /* placeholder: application logic would use this */
        /* ... */
    }

    return sum;
}

/**
 * Benchmark driver. Fills a[] with ones and b[] with 0, 1, 2, ..., runs
 * speedy() once untimed, then times REPEAT further runs and prints the
 * accumulated result together with the average time per run.
 */

int main(void){
    const unsigned count = PROBLEM, width = PROBLEM;
    unsigned a[count], b[count];
    double   workspace[3*width];
    double   checksum = 0;
    unsigned k        = 0;

    while(k < count){
        a[k] = 1;
        b[k] = k;
        k++;
    }

    /* Untimed first run. */
    speedy(a, b, count, workspace, width);

    const clock_t start = clock();
    for(unsigned run = 0; run < REPEAT; run++){
        /* The results are summed and printed below, so the calls have a visible effect. */
        checksum += speedy(a, b, count, workspace, width);

        #if PRINT_PROGRESS
        putchar('.');
        fflush(stdout);
        #endif
    }
    const clock_t elapsed = clock() - start;

    #if PRINT_PROGRESS
    putchar('\n');
    #endif

    printf("dummy = %f, average time %.9f\n", checksum, elapsed/((double)CLOCKS_PER_SEC*REPEAT));
}

用法

假设你把它放在名为upd_avg.c的文件中，命令

gcc -O3 upd_avg.c -o upd_avg -DPRINT_PROGRESS=0 -DNAIVE=0 -DREPEAT=2260 -DPROBLEM=128
gcc -O3 upd_avg.c -o upd_avg -DPRINT_PROGRESS=0 -DNAIVE=1 -DREPEAT=2260 -DPROBLEM=128

将分别编译我的O（u log n）算法和其他所有人的天真O（un）算法。

结果

对于u与n相同的情况，差异如同黑夜与白昼（或mergesort与bubblesort）一样分明：

                 |         Average time/run (s)
     Size        |    -DNAIVE=0     |    -DNAIVE=1
_________________|_____________________________________
 -DPROBLEM=2     |   0.000000094    |   0.000000071
 -DPROBLEM=4     |   0.000000196    |   0.000000180
 -DPROBLEM=8     |   0.000000482    |   0.000000809
 -DPROBLEM=16    |   0.000000989    |   0.000002556
 ...             |   ...            |   ...
 -DPROBLEM=128   |   0.000007623    |   0.000150181
 -DPROBLEM=256   |   0.000016713    |   0.000590156
 -DPROBLEM=512   |   0.000037765    |   0.002338671
 -DPROBLEM=1024  |   0.000077752    |   0.009324281
 -DPROBLEM=2048  |   0.000167924    |   0.037225660
 -DPROBLEM=4096  |   0.000343608    |   0.146875721 (*)
 ...             |   ...            |   ...
 -DPROBLEM=65536 |   0.007426288    |  21.264978500 (**)
 ...             |   ...            |  We haaaveee liiiffffttttooofffff!!!!!!!

（*）-DREPEAT=226而不是2260。（**）-DREPEAT=2而不是2260，CPU风扇速度加倍。

内部实现

我的speedy()函数接受任意大小n >= 2的unsigned int数组a和b。但是，它还需要分配大小为3*N的工作空间内存，其中N必须是2的幂，最好等于n向上取整到下一个更高的2的幂。

函数speedyInit()建立求和树，并由此计算初始总和；该总和位于工作空间中树的根部，为简化实现，根被定义为元素w[1]。

函数speedyUpd()是实现对数时间求和传播的函数。它内部的while循环优雅地实现了从叶子向上遍历树。它由-DNAIVE=0启用。

函数slowyUpd()是朴素的实现。它由-DNAIVE=1启用，因为慢而得名。

备注

N.B。在尝试对我的代码进行基准测试时，我发现GCC的常量折叠和DCE（死代码消除）在删除无用代码或只让函数运行一次方面非常出色。

N.B。既然精度至关重要，你却按顺序累加数量级可能相差很大的数，而没有考虑Kahan求和算法，这让我觉得有些奇怪。

Answer 3

以下代码在Intel Core i7-3770上、使用Apple LLVM 5.0并以“-O3”编译时，比原始代码快16倍，并且通常更准确（因为它更有可能把大小相近的数相加，从而避免在浮点加法中丢失位）。

由于迭代之间只有一个a[i]发生变化，我们可以缓存所有的商。我们还可以把加法组织成二叉树，并缓存其中大部分的和。这样，当某个a[i]变化时，我们只需要更新沿二叉树单条路径上的和。

首先，我们定义一个数组来保存商和它们的总和，我们初始化它：

// Define number of elements in base arrays.
#define N 100

// Define size needed by adding sizes of each level of tree.
// A level with an odd number of elements is padded with one zero element so
// that every parent has two children; hence 26, 14 and 8 rather than 25, 13
// and 7.
#define P   (100+50+26+14+8+4+2+1)

// Define array holding every level of the tree back to back: the quotients
// first, then each level of sums, with the root last.
double q[P];

// Initialize first level with quotients.
for (int i = 0; i < N; ++i)
    q[i] = (double) b[i] / a[i];

// For each other level, form sums from two elements of previous level.
for (int b0 = 0, t = N; 1 < t;)
{
    // On entry, t is the number of elements in the previous level and
    // b0 is the base of that previous level.
    // b1 becomes the base of the level being built in this iteration.
    int b1 = b0 + t;

    // If t is odd, pad the previous level with a zero element; the level
    // being built then starts one slot later.
    if (t & 1)
        q[b1++] = 0;

    // Calculate the size of the current level.
    t = (t+1)/2;

    // Calculate each element in the current level from the previous level.
    for (int i = 0; i < t; ++i)
        q[b1+i] = q[b0+2*i+0] + q[b0+2*i+1];

    // Set the base for the next level.
    b0 = b1;
}

每当元素a[i]发生变化时，我们都会更新其存储的商并更新树：

// Recomputes the quotient for element i, then refreshes the one cached sum
// per level that depends on it. Returns the total, stored at q[P-1].
double C(unsigned int a[], unsigned int b[], double q[], int i)
{
    int levelBase = 0;   // index at which the child level starts
    int levelSize = N;   // number of elements in the child level
    int node      = i;   // position of the affected element within its level

    // Refresh the leaf.
    q[node] = (double) b[node] / a[node];

    while (levelSize > 1)
    {
        // The parent level starts right after the child level, skipping the
        // padding slot that exists when the child level has odd size.
        const int parentBase = levelBase + levelSize + (levelSize & 1);

        levelSize = (levelSize + 1) / 2;

        // Each parent covers two children.
        node /= 2;

        // Only this one sum changes in the parent level.
        q[parentBase + node] = q[levelBase + 2*node] + q[levelBase + 2*node + 1];

        levelBase = parentBase;
    }

    // The last slot is the root.
    return q[P-1];
}

Answer 4

OP的评论表明其原始代码段中的循环会反复运行，但在运行之间，只有a的一个条目发生更改，b的条目没有更改。所以：

unsigned int delta;
unsigned int a[100];
unsigned int b[100];

// ...

double sum = 0;

// run ONLY ONCE
for(int i = 0; i < 100; i++)
    sum += (double)b[i]/a[i];

// ...

// run during successive iterations, when a[index] changes
// Remove the old contribution of element index, increment a[index], then add
// the new contribution: two divisions per update instead of 100.
// NOTE(review): sum is carried from one update to the next, so rounding
// errors may accumulate in it; confirm this is acceptable given the
// precision requirement stated in the question.
sum -= (double)b[index]/a[index];
a[index]++;
sum += (double)b[index]/a[index];

bool possible = delta >= sum;

// ...

// ...

编辑:(对OP的评论）从OP评论帖子中的对话看来，OP有一个问题比他最初建议的更简单。所以，OP，我真的认为如果你只发布这段代码的目的，你最终会得到一个更好的答案，因为可能有更好的方法来解决你的实际的问题。请参阅https://meta.stackexchange.com/questions/66377/what-is-the-xy-problem。

Answer 5

此代码计算总和的速度大约是原始代码的两倍，测试环境为Intel Core i7-3770，使用Apple LLVM 5.0并以“-O3”编译：

// Maximum number of terms merged into one fraction before a division is done.
#define L   50

double sum = 0;

// Running fraction numerator/denominator for the current group of terms.
double numerator = 0, denominator = 1;

for (int i = 0; i < N; i += L)
{
    // Merge up to L terms without dividing, using
    // n/d + b/a = (n*a + d*b) / (d*a).
    // NOTE(review): the products are formed in double, so they are rounded
    // once they no longer fit the mantissa; confirm the resulting precision
    // is acceptable.
    for (int j = i; j < i+L && j < N; ++j)
    {
        numerator = numerator * a[j] + denominator * b[j];
        denominator *= a[j];
    }
    // One division per group, then reset the running fraction.
    sum += numerator / denominator;
    numerator = 0;
    denominator = 1;
}

它的工作原理是避免划分，这是一项耗时的操作。相反，它只是添加分数而不减少它们。

我加入了第二层循环，用于在累积的分数可能大到溢出double范围时进行合并。在本例中这并不是必需的，因为每个a[i]最多为500，并且最多有100个，因此累积分母最大为500¹⁰⁰，仍在double范围内。由于每个b[i]也最多为500，累积分子不会超过（2•500）¹⁰⁰，同样在范围内。

如果涉及其他参数，则可以设置L，以限制两次合并之间的迭代次数，从而防止溢出。

Answer 6

您可以使用较低的精度（例如，定点算术）将算法更改为“高估”总和，并且仅当此高估证明大于delta时，才能以更高的精度进行计算。

Answer 7

你需要双精度，因为你有一个布尔结果？没有这样的规则。我理解你想要“可预测的”结果。

double计算的精度也并不是那么可预测，所以我很想知道你是否愿意改用定点运算（例如将所有输入乘以2¹⁶左右）。由于你最小的a/b是1/500，最大的a/b是500，这意味着动态范围为500²，这样就足够了。你可能产生的最大绝对误差“足够小”。

可以通过两个小功能完成：

/*
 * Returns a/b as a fixed-point number with 16 fractional bits, that is
 * a * 2^16 / b truncated toward zero.
 *
 * The scaling is done in long long: an int `a << 16` overflows once
 * a >= 32768 and is undefined for negative a, even when the final quotient
 * would fit in an int.
 *
 * b must be non-zero and the quotient must fit in int.
 */
int toFixedPoint(int a, int b) {
    return (int)(((long long)a * 65536) / b);
}

/* Shifts out the 16 fractional bits of a fixed-point value. */
int fromFixedPoint(int q) {
   const int fractionBits = 16;
   return q >> fractionBits;
}

由于你的数组变化非常小，你可能想要重写方程式，以便你可以区别地表达它们：

/*
 * Returns the running fixed-point total after the element at changedIndex
 * has changed.
 *
 * previousTotal  fixed-point total before the change
 * changedIndex   index of the element that changed
 *
 * NOTE(review): the term added here is the fixed-point value of
 * 1 / b[changedIndex]. That does not obviously equal the change in
 * b[i]/a[i] caused by a[i]++; confirm the intended formula.
 */
int nextTotal(int previousTotal, int changedIndex) {
  return previousTotal + toFixedPoint(1, b[changedIndex]);
}

在循环中：

// Running fixed-point total.
// NOTE(review): no type is given; implicit int is not valid since C99.
// Presumably `static int` is meant; confirm.
static total = 0;

// Find which element changed and fold its contribution into the total.
int i = changedIndex(a);
total = nextTotal(total, i);
// NOTE(review): the question tests `delta >= sum`; this comparison runs the
// other way round. Confirm which direction is intended.
if (delta <= fromFixedPoint(total)) {
   ...
}

这会将计算次数减少100倍：）

双重瓶颈，如何改善呢？

7 个答案:

代码

用法

结果

内部实现

备注