Question

我目前有以下内容：

#include <cstdint>
#include <memory>

/// Reports whether every byte of the object representation of `num` is zero.
///
/// The object is examined one byte at a time, starting at its address and
/// covering exactly `sizeof(T)` bytes.
///
/// @tparam T  Type of the inspected object.
/// @param num Object whose bytes are examined.
/// @return `true` when all `sizeof(T)` bytes are zero; `false` as soon as a
///         non-zero byte is encountered.
template<typename T>
bool isZeroed(T const & num) {
    auto const * cursor = static_cast<uint8_t const *>(
        static_cast<void const *>(std::addressof(num)));
    auto const * const stop = cursor + sizeof(T);
    while (cursor != stop)
    {
      // Read the current byte, then step to the next one.
      if (*cursor++ != 0)
        return false;
    }
    return true;
}

// Driver: widens argc to 64 bits, runs isZeroed() on it and uses the
// boolean result (converted to int) as the process exit status.
int main(int argc, char * argv[])
{
  uint64_t const widened = static_cast<uint64_t>(argc);
  bool const allZero = isZeroed(widened);
  return allZero;
}

使用-O3在gcc 7上生成：

main:
        movsx   rdi, edi
        test    dil, dil
        mov     QWORD PTR [rsp-8], rdi
        jne     .L9
        cmp     BYTE PTR [rsp-7], 0
        jne     .L9
        cmp     BYTE PTR [rsp-6], 0
        jne     .L9
        cmp     BYTE PTR [rsp-5], 0
        jne     .L9
        cmp     BYTE PTR [rsp-4], 0
        jne     .L9
        cmp     BYTE PTR [rsp-3], 0
        jne     .L9
        cmp     BYTE PTR [rsp-2], 0
        jne     .L9
        cmp     BYTE PTR [rsp-1], 0
        sete    al
.L2:
        movzx   eax, al
        ret
.L9:
        xor     eax, eax
        jmp     .L2

请参阅https://godbolt.org/g/HWB3is

在我看来，应该可以把这些BYTE比较合并成每次比较更多字节的比较，例如WORD / DWORD / QWORD比较。

有人知道是我代码中的什么地方阻止了优化器执行这种优化，还是说这在gcc中根本做不到？

Answer 1

参考Jester的评论，我写出了下面的代码。

#include <cstdint>
#include <cstring>
#include <memory>

/// Consumes as many `CmpSizeType`-sized chunks as fit into the remaining
/// bytes and checks that each of them is zero.
///
/// Each chunk is copied into a local `CmpSizeType` with `std::memcpy` before
/// being compared. Dereferencing the buffer through a `CmpSizeType const *`
/// would be undefined behaviour when the underlying object has a different
/// type (strict aliasing) or when `pos` is not suitably aligned; the copy
/// has neither problem.
///
/// @tparam CmpSizeType Integer type whose size determines the chunk width.
/// @param pos          In/out cursor into the buffer; advanced past every
///                     chunk that compared equal to zero.
/// @param bytesLeft    In/out count of unread bytes starting at `pos`;
///                     decreased by `sizeof(CmpSizeType)` per zero chunk.
/// @return `false` as soon as a non-zero chunk is found (`pos` and
///         `bytesLeft` then still refer to that chunk); `true` once fewer
///         than `sizeof(CmpSizeType)` bytes remain.
template<typename CmpSizeType>
bool isZeroed(void const * & pos, size_t & bytesLeft) {
  while (bytesLeft >= sizeof(CmpSizeType))
  {
    CmpSizeType chunk;
    std::memcpy(&chunk, pos, sizeof(CmpSizeType));
    if (chunk != CmpSizeType(0)) return false;
    pos = static_cast<unsigned char const *>(pos) + sizeof(CmpSizeType);
    bytesLeft -= sizeof(CmpSizeType);
  }
  return true;
}

/// Reports whether every byte of the object representation of `num` is zero.
///
/// The bytes are scanned in three passes that share one cursor and one
/// remaining-byte counter: first in 64-bit chunks, then in 32-bit chunks,
/// and finally byte by byte for whatever is left. Scanning stops at the
/// first pass that finds a non-zero chunk.
///
/// @tparam T  Type of the inspected object.
/// @param num Object whose bytes are examined.
/// @return `true` if all `sizeof(T)` bytes are zero, `false` otherwise.
template<typename T>
bool isZeroed(T const & num) {
  void const * cursor = std::addressof(num);
  size_t remaining = sizeof(T);
  // && short-circuits, so a narrower pass only runs if the wider one passed.
  return isZeroed<uint64_t>(cursor, remaining)
      && isZeroed<uint32_t>(cursor, remaining)
      && isZeroed<uint8_t>(cursor, remaining);
}

// Sample aggregate mixing 8-, 16-, 32- and 64-bit signed members; used by
// main() as an input for isZeroed().
struct T{
  // 8-bit members
  int8_t  a1;
  int8_t  b1;
  int8_t  c1;
  // 16-bit member
  int16_t a2;
  // 32-bit members
  int32_t a3;
  int32_t b3;
  int32_t c3;
  int32_t c4;
  // 64-bit member
  int64_t a4;
  // trailing 16-bit member
  int16_t a5;
};

// Driver: checks a T whose a3 member is set to argc (all other members 0)
// and, only if that reports zeroed, checks argc narrowed to int8_t. The
// combined boolean result (converted to int) is the exit status.
int main(int argc, char * argv[])
{
  bool const structIsZeroed = isZeroed(T{ 0,0,0,0,argc,0,0,0,0,0 });
  if (!structIsZeroed)
    return false;
  return isZeroed(static_cast<int8_t>(argc));
}

这样就得到了QWORD比较：

main:
        pxor    xmm0, xmm0
        movaps  XMMWORD PTR [rsp-56], xmm0
        mov     DWORD PTR [rsp-48], edi
        cmp     QWORD PTR [rsp-48], 0
        movaps  XMMWORD PTR [rsp-40], xmm0
        jne     .L6
        cmp     QWORD PTR [rsp-40], 0
        jne     .L6
        cmp     QWORD PTR [rsp-32], 0
        jne     .L6
        xor     eax, eax
        test    dil, dil
        sete    al
        ret
.L6:
        xor     eax, eax
        ret

Answer 2

当你告诉编译器查看BYTE大小的块时，它会这样做。如果您希望它看到DWORD或QWORD大小的块，您需要让它这样做。如下所示：

/// Reports whether every byte of the object representation of `num` is zero,
/// comparing eight bytes at a time where possible.
///
/// The first loop handles whole 64-bit chunks and runs only while at least
/// `sizeof(uint64_t)` bytes remain, so it never reads past the end of the
/// object when `sizeof(T)` is not a multiple of eight. Each chunk is copied
/// out with `std::memcpy` rather than read through a casted pointer, which
/// avoids strict-aliasing and alignment problems. The second loop checks the
/// remaining tail (fewer than eight bytes) one byte at a time.
///
/// @tparam T  Type of the inspected object.
/// @param num Object whose bytes are examined.
/// @return `true` if all `sizeof(T)` bytes are zero, `false` otherwise.
template<typename T>
bool isZeroed(T const & num) {
    void const * ptr = std::addressof(num);
    uint8_t const * pos = static_cast<uint8_t const *>(ptr);
    uint8_t const * const endpos = pos + sizeof(T);
    // `remaining` counts the bytes from `pos` up to `endpos`.
    for (auto remaining = sizeof(T); remaining >= sizeof(uint64_t);
         remaining -= sizeof(uint64_t), pos += sizeof(uint64_t))
    {
      uint64_t chunk;
      std::memcpy(&chunk, pos, sizeof(chunk));
      if (chunk != 0)
        return false;
    }
    // Tail: whatever did not fill a whole 64-bit chunk.
    for (;pos < endpos; ++pos)
    {
      if (*pos != uint8_t(0))
        return false;
    }
    return true;
}

我们首先循环比较QWORD大小的块到零。如果找到任何非零值，则其中一个BYTE必须为非零，因此返回false。然后，我们再次循环处理任何剩余的BYTE大小的块。

这会生成您想要的代码。实际上，甚至更好 - 编译器知道您将QWORD大小的值传递给模板函数，因此它只使用TEST指令：

main:
    xor     eax, eax
    test    edi, edi
    sete    al
    ret

如果使用GCC扩展并传递128位整数值，则该函数将编译为以下内容，按预期进行两次QWORD比较：

main:
    movsx   rdi, edi
    cmp     QWORD PTR [rdi], 0
    jne     .L3
    cmp     QWORD PTR [rdi+8], 0
    sete    al
.L2:
    movzx   eax, al
    ret
.L3:
    xor     eax, eax
    jmp     .L2

请注意，用reinterpret_cast以这种方式读取内存是非常不安全的。我也质疑这种用法的必要性。可以为所有整数大小分别编写模板特化——数量并不多——这样就能在不牺牲安全性的前提下得到更好的代码。或者，如果您要测试的是任意内存块，请使用memcmp，例如：

/// Reports whether every byte of the object representation of `num` is zero
/// by comparing it against a zero-filled buffer of the same size.
///
/// `std::memcmp` returns 0 when the two ranges are equal, so the result is
/// compared against 0 to yield `true` for an all-zero object.
///
/// @tparam T  Type of the inspected object.
/// @param num Object whose bytes are examined.
/// @return `true` if all `sizeof(T)` bytes are zero, `false` otherwise.
template<typename T>
bool isZeroed(T const & num) {
    void const * ptr = std::addressof(num);
    uint8_t temp[sizeof(T)] = { };  // value-initialized: all bytes zero
    return std::memcmp(temp, ptr, sizeof(T)) == 0;
}

让gcc将一系列BYTE比较转换为WORD / DWORD / QWORD比较

2 个答案: