Question

下面两个代码块都计算两个整数之间的汉明距离。它们的结果相同，但为什么在 LeetCode 上第一个比第二个快？

快速：

/* Hamming distance of x and y: for each of 32 bit positions, compare the
 * lowest bit of both operands, then shift both operands right by one. */
int hammingDistance(int x, int y) {
    int differing = 0;
    int remaining = 32;
    while (remaining-- > 0) {
        differing += (x & 1) ^ (y & 1);
        x >>= 1;
        y >>= 1;
    }
    return differing;
}

慢：

/* Hamming distance of x and y: XOR the operands once, then count the set
 * bits of the result by testing its lowest bit 32 times. */
int hammingDistance(int x, int y) {
    int diff = x ^ y;
    int ones = 0;
    int step = 0;
    while (step < 32) {
        ones += diff & 1;
        diff >>= 1;
        ++step;
    }
    return ones;
}

**** 更新 ****

我已经在Mac机器上编写了一个测试代码：

#include <time.h>
#include "cstdio"

// Per-bit variant: XORs the lowest bits of x and y on every one of the 32
// passes, shifting both operands right by one after each comparison.
int hamm_fast(int x, int y) {
    int distance = 0;
    for (int bit = 32; bit > 0; --bit) {
        distance += (x & 1) ^ (y & 1);
        x >>= 1;
        y >>= 1;
    }
    return distance;
}

// XOR-first variant: combines x and y once, then counts the set bits of the
// combined value by inspecting its lowest bit on each of 32 passes.
int hamm_slow(int x, int y) {
    int mixed = x ^ y;
    int distance = 0;
    for (int bit = 32; bit > 0; --bit) {
        distance += mixed & 1;
        mixed >>= 1;
    }
    return distance;
}

// Benchmark driver: times 100000 calls of hamm_fast and then 100000 calls of
// hamm_slow using clock(), printing each elapsed time in milliseconds.
//
// NOTE(review): both loops pass the constant arguments (100, 100) and discard
// the return value. A compiler with optimization enabled may be able to drop
// the calls, in which case the timings would not measure the functions.
int main()
{
    clock_t begin;
    clock_t end;
    double time_spent;

    // Time the per-bit XOR variant.
    begin = clock();
    for (int i = 0; i < 100000; ++i)
        hamm_fast(100,100);
    end = clock();
    // Convert clock ticks to seconds; scaled to milliseconds when printed.
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("Fast: %f ms\n", time_spent*1e3);

    // Time the XOR-once variant with the same call count and arguments.
    begin = clock();
    for (int i = 0; i < 100000; ++i)
        hamm_slow(100,100);
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("Slow: %f ms\n", time_spent*1e3);

    return 0;
}

使用以下命令编译并运行：

g++ ham.cpp && ./a.out

这两种解法的耗时变得相近。例如：

Fast: 7.233000 ms
Slow: 6.963000 ms

实际上，“慢”的那个反而更快……

Answer 1

正如我在评论中指出的那样，您需要谨慎对待基准测试，确保激进的优化器不会把函数调用优化掉。

激进的优化器可能会删除整个测试循环，因为您在每次迭代中都向汉明距离函数传入相同的值，并且忽略了返回值；这样一来就没有任何东西可以测量了。

如果由我来做测试，我会写一个同时包含计时和函数调用的测试函数。函数调用放在两层嵌套循环内，我把两个循环索引作为参数传给被测函数，对返回值求和，并在最后打印总和，以帮助确认各函数产生相同的结果。我还会让它运行几秒钟，而不是几毫秒。

这是我的代码。它使用了我在 GitHub 上的 SOQ（Stack Overflow Questions）存储库中提供的计时代码，即 src/libsoq 子目录中的文件 timer.c 和 timer.h。

#include "timer.h"
#include <stdio.h>

#define L1_MIN 0
#define L1_MAX 10240
#define L2_MIN 0
#define L2_MAX 10240

/* Per-bit variant: on each of 32 passes, XOR the lowest bits of x and y into
 * the running total, then shift both operands right by one. */
static int hamm_fast(int x, int y)
{
    int total = 0;
    int pass = 0;
    do
    {
        total += (x & 1) ^ (y & 1);
        x >>= 1;
        y >>= 1;
    } while (++pass < 32);
    return total;
}

/* XOR-first variant: combine x and y once, then add the lowest bit of the
 * combined value to the running total on each of 32 passes. */
static int hamm_slow(int x, int y)
{
    int combined = x ^ y;
    int total = 0;
    int pass = 0;
    do
    {
        total += combined & 1;
        combined >>= 1;
    } while (++pass < 32);
    return total;
}

/* Run `function` over every (i, j) pair in [L1_MIN, L1_MAX) x [L2_MIN, L2_MAX)
 * between clk_start() and clk_stop(), then print the tag, the sum of all
 * return values, the number of calls and the string from clk_elapsed_us().
 * Printing the sum keeps the results in use and lets runs be compared. */
static void tester(const char *tag, int (*function)(int x, int y))
{
    Clock timer;
    clk_init(&timer);

    clk_start(&timer);
    int total = 0;
    int outer = L1_MIN;
    while (outer < L1_MAX)
    {
        int inner = L2_MIN;
        while (inner < L2_MAX)
        {
            total += function(outer, inner);
            inner++;
        }
        outer++;
    }
    clk_stop(&timer);

    const int calls = (L1_MAX - L1_MIN) * (L2_MAX - L2_MIN);
    char elapsed[32];
    printf("%s sum = %d (%d iterations) %s\n", tag, total, calls,
            clk_elapsed_us(&timer, elapsed, sizeof(elapsed)));
}

/* Run the two benchmarks back to back, ten rounds in total, so the printed
 * timings can be compared across repeated runs. */
int main(void)
{
    enum { ROUNDS = 10 };

    int round = 0;
    while (round < ROUNDS)
    {
        tester("Fast", hamm_fast);
        tester("Slow", hamm_slow);
        round++;
    }
    return 0;
}

我一次运行的输出是：

Fast sum = 710934528 (104857600 iterations) 2.461100
Slow sum = 710934528 (104857600 iterations) 1.181584
Fast sum = 710934528 (104857600 iterations) 2.480401
Slow sum = 710934528 (104857600 iterations) 1.182961
Fast sum = 710934528 (104857600 iterations) 2.466685
Slow sum = 710934528 (104857600 iterations) 1.197394
Fast sum = 710934528 (104857600 iterations) 2.435806
Slow sum = 710934528 (104857600 iterations) 1.175533
Fast sum = 710934528 (104857600 iterations) 2.384162
Slow sum = 710934528 (104857600 iterations) 1.184161
Fast sum = 710934528 (104857600 iterations) 2.376042
Slow sum = 710934528 (104857600 iterations) 1.191555
Fast sum = 710934528 (104857600 iterations) 2.389027
Slow sum = 710934528 (104857600 iterations) 1.169186
Fast sum = 710934528 (104857600 iterations) 2.393707
Slow sum = 710934528 (104857600 iterations) 1.209600
Fast sum = 710934528 (104857600 iterations) 2.423526
Slow sum = 710934528 (104857600 iterations) 1.204585
Fast sum = 710934528 (104857600 iterations) 2.515968
Slow sum = 710934528 (104857600 iterations) 1.196783

如您所见，“快速”代码的耗时大约是“慢速”代码的两倍。这主要是因为“快速”代码在每次循环迭代中执行的操作比“慢速”代码多：“快速”代码执行 2 次 & 操作、1 次 ^ 操作和 2 次 >>= 操作，而“慢速”代码只执行 1 次 & 操作和 1 次 >>= 操作。不过两者的结果完全相同，这是个好消息。这些函数在结果上是等效的，但在速度上并不等效。

使用GCC 8.2.0在运行macOS 10.13.6 High Sierra的MacBook Pro上进行编译。

编译命令行（源文件spot79.c）：

$ gcc -O3 -g -I./inc -std=c11 -Wall -Wextra -Werror -Wmissing-prototypes \
>     -Wstrict-prototypes spot79.c -o spot79 -L./lib -lsoq
$

timer.h标头位于./inc目录中，而soq库位于./lib中—这只是我的构建设置。

逐位运算的速度差

1 个答案: