Question

嗯，ideone 上显示它更慢。在我的虚拟机（Ubuntu Wily Werewolf）里用 g++ 4.9.3，我得到 A：830，B：460。无论哪种情况，为什么一个会比另一个快？我猜 A 是按引用传递的，而 B 是按值传递的，但我不明白为什么会这样。我试过改用 operator<(A a, ...，但没有帮助。

// Two 32-bit values kept as separate fields. Ordering against a bare key
// looks at `first` only.
struct A {
    u32 first;
    u32 second;

    // Stores both arguments, converting each int to u32.
    A(int f, int s)
        : first(static_cast<u32>(f)),
          second(static_cast<u32>(s)) {}

    // True when this element's `first` is strictly below the key b.
    bool operator<(u32 b) const {
        return first < b;
    }
};
//bool operator<(const A&a, u32 b) { return a.first < b; }

struct B {
    u64 v;
    B(int f, int s) { v = ((long)f << 32) | s; }
    bool operator<(u32 b) const {
        return (v >> 32) < b;
    }
};
// Looks up v in d with lower_bound (using A::operator<) and returns the
// `second` field of the element found, or UINT_MAX when lower_bound
// returns d.end().
u32 get_value(deque<A>& d, u32 v) {
    const auto hit = lower_bound(d.begin(), d.end(), v);
    return hit == d.end() ? UINT_MAX : hit->second;
}
// Looks up v in d with lower_bound (using B::operator<) and returns the
// low 32 bits of the element found, or UINT_MAX when lower_bound returns
// d.end().
u32 get_value(deque<B>& d, u32 v) {
    const auto hit = lower_bound(d.begin(), d.end(), v);
    if (hit == d.end()) {
        return UINT_MAX;
    }
    return static_cast<u32>(hit->v & 0xFFFFFFFF);
}

int main(int argc, char *argv[]) {

    {
        deque<A> d;
        struct timeval s, e;
        gettimeofday(&s, 0);
        for (int i = 0; i < 1024LL * 1024 * 1024 * 3 / 32; ++i)
            d.emplace_back(A(i, i ^ 92142));
        long v = 0;
        for (int i = 0; i < 10000; ++i)
            v += get_value(d, i * 3 + 1);

        gettimeofday(&e, 0);
        auto sec = e.tv_sec - s.tv_sec;
        auto usc = e.tv_usec - s.tv_usec;
        printf("A %ld\n", v);
        printf("Time: %lu\n", sec * 1000 + usc / 1000);
    }
    {
        deque<B> d;
        struct timeval s, e;
        gettimeofday(&s, 0);
        for (int i = 0; i < 1024LL * 1024 * 1024 * 3 / 32; ++i)
            d.emplace_back(B(i, i ^ 92142));
        long v = 0;
        for (int i = 0; i < 10000; ++i)
            v += get_value(d, i * 3 + 1);

        gettimeofday(&e, 0);
        auto sec = e.tv_sec - s.tv_sec;
        auto usc = e.tv_usec - s.tv_usec;
        printf("A %ld\n", v);
        printf("Time: %lu\n", sec * 1000 + usc / 1000);
    }
}

Answer 1

我认为你用 struct B { u64 v; } 获得了加速，是因为你是用编译时常量参数来使用它的。编译器可以把这两个值合并成一次 64 位存储（store）。

我写了简单的非成员函数，以便查看 A 和 B 的 operator< 在未内联情况下生成的 asm 输出。正如 godbolt 所示，struct A::operator< 的 asm 明显更高效；B::operator< 确实是先做一次 64 位加载，然后再移位。为了照顾不熟悉 x86 的读者，我给 asm 加了注释。

// Non-member wrapper that applies `<` to an A and a u32 key; compiled on
// its own to inspect the code generated for the comparison.
bool comp_A(const struct A &lhs, u32 rhs) {
    const bool is_less = lhs < rhs;
    return is_less;
}
// Non-member wrapper that applies `<` to a B and a u32 key; compiled on
// its own to inspect the code generated for the comparison.
bool comp_B(const struct B &lhs, u32 rhs) {
    const bool is_less = lhs < rhs;
    return is_less;
}

comp_A(A const&, unsigned int):
    cmpl    %esi, (%rdi)  # compare lhs->first (read directly as a memory operand) with rhs
    setb    %al           # al=1 if lhs->first is "below" rhs (unsigned <), else 0
    ret
comp_B(B const&, unsigned int):
    movq    (%rdi), %rax  # load lhs->v
    movl    %esi, %esi    # zero-extend the low 32b of rhs
    shrq    $32, %rax     # shift the high32 of v down to the low 32
    cmpq    %rsi, %rax    # compare (v >> 32) with the zero-extended rhs
    setb    %al           # al=1 if rax is "below" rsi (unsigned <), else 0.
    ret

x86 有快速的硬件支持，可以把任意大小的整数从内存加载到 32 位或 64 位的临时量（寄存器）中。用 movzx 和 movsx 做零扩展或符号扩展的加载，与用 mov 做普通加载一样便宜。32 位操作总是会把目标寄存器的高 32 位清零，因此不存在对旧值的假依赖（false dependency）问题。（对于 16 位和 8 位操作，假依赖则确实是个问题，这就是为什么用 movzx 加载到 32 位临时量是个好办法。）

我没有看你实际函数的 asm，因为它非常大。不过，一般情况下我还是建议使用版本 A。它之所以较慢，可能是因为 gcc 没有用 64 位 mov 来整体复制它，也许是因为它可能未对齐？x86 并不要求对齐（非 AVX（legacy-SSE）的 16 字节内存操作数除外）。不过，如果我没记错的话，只是从大约 2008 或 2009 年的 CPU（英特尔的 Nehalem）开始，未对齐的加载/存储只要不跨越缓存行就不会有任何惩罚。gcc 可能仍然不愿意使用它们，因为潜在的收益小于在未对齐访问很慢的旧 CPU 上的潜在损失。

通过使用联合体（union），你也许能让 gcc 给你两全其美的结果。

// Overlays a single u64 with an array of two u32 values, so the storage
// can be accessed either as one whole word (v) or one field at a time
// (first_last[i]).
// NOTE(review): reading a member other than the one most recently written
// is type punning through a union; GCC documents this as supported, but
// ISO C++ does not guarantee it. Confirm for the compilers you target.
union A {
    u64 v;
    u32 first_last[2];
};

这可能会让编译器在复制时使用 64 位 mov，同时在访问单个字段时仍然只做 32 位的 A.first_last[0] 加载，而无需移位。

Answer 2

使用寄存器大小的变量总是最快的，否则你必须处理可能未对齐的数据和裁剪操作。

我认为这是一个安全的假设，即ideone正在使用64位编译器，因此可以解释这一点。

Answer 3

获得准确的高分辨率计时非常困难。gettimeofday 函数并不保证具有微秒级的分辨率，并且可能受到许多其他系统活动（ntpd、多线程等）的影响。如果你的机器支持读取时钟寄存器（例如英特尔处理器），可以使用滴答计时器（tick timer）。

用你的代码配合滴答计时器，我注意到耗时最短的那个会在两个结构体之间来回变化（我不是在虚拟机上运行的）。

#include <cstdio>
#include <deque>
#include <algorithm>
#include <climits>
#include <sys/time.h>
#include <stdint.h>
using namespace std;

typedef unsigned int u32;
typedef unsigned long long u64;
static_assert(sizeof(u32)==4 && sizeof(u64)==8, "Fail");

// Tick rate used to turn an rdtsc() delta into seconds. It is hard-coded,
// so it must be adjusted to the machine that runs the benchmark.
#define CLOCK_TICKS 2400000000  // my machine clock is 2.4 GHz

// Two 32-bit values kept as separate fields. Ordering against a bare key
// looks at `first` only.
struct A {
    u32 first;
    u32 second;

    // Stores both arguments, converting each int to u32.
    A(int f, int s)
        : first(static_cast<u32>(f)),
          second(static_cast<u32>(s)) {}

    // True when this element's `first` is strictly below the key b.
    bool operator<(u32 b) const {
        return first < b;
    }
};
//bool operator<(const A&a, u32 b) { return a.first < b; }

struct B {
    u64 v;
    B(int f, int s) { v = ((u64)f << 32) | s; }
    bool operator<(u32 b) const {
    return (v >> 32) < b;
    }
};
// Looks up v in d with lower_bound (using A::operator<) and returns the
// `second` field of the element found, or UINT_MAX when lower_bound
// returns d.end().
u32 get_value(deque<A>& d, u32 v) {
    const auto hit = lower_bound(d.begin(), d.end(), v);
    return hit == d.end() ? UINT_MAX : hit->second;
}
// Looks up v in d with lower_bound (using B::operator<) and returns the
// low 32 bits of the element found, or UINT_MAX when lower_bound returns
// d.end().
u32 get_value(deque<B>& d, u32 v) {
    const auto hit = lower_bound(d.begin(), d.end(), v);
    if (hit == d.end()) {
        return UINT_MAX;
    }
    return static_cast<u32>(hit->v & 0xFFFFFFFF);
}

// This function was originally published in the Intel hardware manual
// for the first chips that supported the SSE instruction set.  I've
// seen variations on this for 12+ years and do not know where to
// attribute its origin

 // Reads the processor's time-stamp counter and returns it as a single
 // 64-bit tick count.
 inline uint64_t rdtsc() {
    uint32_t lo, hi;
    __asm__ __volatile__ (
      // EAX is zeroed and CPUID executed before RDTSC. CPUID is used here as
      // a serializing instruction, so earlier instructions have completed
      // before the counter is sampled.
      "xorl %%eax, %%eax\n"
      "cpuid\n"
      "rdtsc\n"
      // RDTSC leaves the low half of the counter in EAX and the high half
      // in EDX.
      : "=a" (lo), "=d" (hi)
      :
      // CPUID also overwrites EBX and ECX, so they are declared clobbered.
      // NOTE(review): there is no "memory" clobber, so the compiler may
      // still move ordinary loads/stores across this statement; confirm
      // that is acceptable for the measurement.
      : "%ebx", "%ecx");
    // Combine the two 32-bit halves into one 64-bit value.
    return (uint64_t)hi << 32 | lo;
}

int main(int argc, char **argv)
{
    uint64_t s, e;
    double elapsed;
    s = rdtsc();   // warm up the timer
    {
        deque<A> d;

        s = rdtsc();
        for (int i = 0; i < 1024LL * 1024 * 1024 * 3 / 32; ++i)
            d.emplace_back(A(i, i ^ 92142));
        long v = 0;
        for (int i = 0; i < 10000; ++i)
            v += get_value(d, i * 3 + 1);

        e = rdtsc();
        elapsed = ((double)e - (double)s) / (double)CLOCK_TICKS;
        printf("A %ld\n", v);
        printf("Time: %lf\n", elapsed);
    }
    {
        deque<B> d;
        s = rdtsc();
        for (int i = 0; i < 1024LL * 1024 * 1024 * 3 / 32; ++i)
            d.emplace_back(B(i, i ^ 92142));
        long v = 0;
        for (int i = 0; i < 10000; ++i)
            v += get_value(d, i * 3 + 1);

        e = rdtsc();

        printf("B %ld\n", v);
        elapsed = ((double)e - (double)s) / (double)CLOCK_TICKS;
        printf("Time: %lf\n", elapsed);
    }   
}

为什么struct {u64}比struct {u32，u32}更快？

3 个答案: