在32位架构上优化可移植的128位整数移位

时间:2017-03-18 04:19:13

标签: c++ c++11 optimization integer unsigned-integer

在业余时间,我一直在开发一个实用程序库,它的功能之一是支持有符号/无符号的128位整数。在某些情况下,该库通过CPU分派(cpu-dispatching)来使用SIMD指令,但仍需要一个可移植的后备实现,以便在其他任何平台上都能运行。最近我实现了128位移位的可移植后备版本。它工作正常,速度也相当快,但没有我期望的那么快,尤其是在32位架构上。

下面是一个包含所有相关类型和函数的精简版本(为完整起见,也包含了64位版本):

typedef uint32_t UInt32;
typedef int32_t Int32;
typedef uint64_t UInt64;
typedef int64_t Int64;

// Returns 0xFFFFFFFF if value != 0, otherwise returns 0.
// Branch-free: the comparison yields 0 or 1, which is negated and then
// converted to unsigned, so 1 becomes an all-ones word.
UInt32 AllOrNothingMask32(Int32 value)
{
    return UInt32(-Int32(value != 0));
}

// Portable unsigned 128-bit integer with logical shift operators.
//
// The value is stored as four 32-bit words or two 64-bit words. The word
// width is selected by CPU_SIZE (CPU_32BIT / CPU_64BIT) and the member order
// by CPU_TYPE (CPU_LE32 / CPU_BE32 / CPU_LE64 / CPU_BE64); both macros are
// expected to be defined by the surrounding build configuration.
struct alignas(16) UInt128
{
    // Ensure the layout matches the architecture.
    // LE = little endian
    // BE = big endian
#if CPU_TYPE == CPU_LE32
    UInt32 mLow;
    UInt32 mLowMid;
    UInt32 mHighMid;
    UInt32 mHigh;
#elif CPU_TYPE == CPU_BE32
    UInt32 mHigh;
    UInt32 mHighMid;
    UInt32 mLowMid;
    UInt32 mLow;
#elif CPU_TYPE == CPU_LE64
    UInt64 mLow;
    UInt64 mHigh;
#elif CPU_TYPE == CPU_BE64
    UInt64 mHigh;
    UInt64 mLow;
#else
    // Without a known layout the struct would have no storage at all; fail
    // with a readable message instead of obscure errors further down.
#error "UInt128: CPU_TYPE must be CPU_LE32, CPU_BE32, CPU_LE64 or CPU_BE64"
#endif

    // Leaves the words uninitialized (trivial default construction).
    UInt128() = default;

    UInt128& operator=(const UInt128& other) = default;

    // Builds the value from four 32-bit words, most significant first.
    UInt128(UInt32 high, UInt32 highMid, UInt32 lowMid, UInt32 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(low),
        mLowMid(lowMid),
        mHighMid(highMid),
        mHigh(high) { }
#elif CPU_SIZE == CPU_64BIT
        mLow((UInt64(lowMid) << 32) | low),
        mHigh((UInt64(high) << 32) | highMid) { }
#endif

    // Builds the value from two 64-bit halves, most significant first.
    UInt128(UInt64 high, UInt64 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(UInt32(low)),
        mLowMid(UInt32(low >> 32)),
        mHighMid(UInt32(high)),
        mHigh(UInt32(high >> 32)) { }
#elif CPU_SIZE == CPU_64BIT
        mLow(low),
        mHigh(high) { }
#endif

    // Returns true when every storage word of both operands is equal.
    // The word width is selected with CPU_SIZE, exactly as the constructors
    // and shift operators do; CPU_TYPE only carries the layout tags.
    bool operator==(const UInt128& other) const noexcept
    {
#if CPU_SIZE == CPU_32BIT
        return mLow == other.mLow &&
               mLowMid == other.mLowMid &&
               mHighMid == other.mHighMid &&
               mHigh == other.mHigh;
#elif CPU_SIZE == CPU_64BIT
        return mLow == other.mLow &&
               mHigh == other.mHigh;
#endif
    }

    // Logical left shift in place. Only the low 7 bits of `shift` are used,
    // so the effective count is always 0-127. Returns *this.
    UInt128& operator<<=(Int32 shift) noexcept
    {
        // Shift is modulo 128, effectively clamping it between 0-127.
        shift &= 0x7F;
        if (shift == 0) {
            return *this;
        }
#if CPU_SIZE == CPU_32BIT
        // Snapshot the words so every result is computed from the old value.
        const UInt32 low = mLow;
        const UInt32 lowMid = mLowMid;
        const UInt32 highMid = mHighMid;
        const UInt32 high = mHigh;

        if (shift < 32) {
            // shift is 1-31 here, so rshift is 1-31 and never a full-width shift.
            const Int32 rshift = 32 - shift;
            mLow = (low << shift);
            mLowMid = (lowMid << shift) | (low >> rshift);
            mHighMid = (highMid << shift) | (lowMid >> rshift);
            mHigh = (high << shift) | (highMid >> rshift);
        } else if (shift < 64) {
            // Whole-word move by one word plus a 0-31 bit shift. When the bit
            // shift is 0 the carry shift would be 32 (undefined for a 32-bit
            // operand), so it is wrapped to 0 and the carry is masked away.
            const Int32 lshift = shift - 32;
            const Int32 rshift = (32 - lshift) & 0x1F;
            const UInt32 rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = (low << lshift);
            mHighMid = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
            mHigh = (highMid << lshift) | ((lowMid >> rshift) & rshiftMask);
        } else if (shift < 96) {
            // Same scheme, moving by two words.
            const Int32 lshift = shift - 64;
            const Int32 rshift = (32 - lshift) & 0x1F;
            const UInt32 rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = 0;
            mHighMid = (low << lshift);
            mHigh = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
        } else {
            // Only the lowest word survives, landing in the highest word.
            mLow = 0;
            mLowMid = 0;
            mHighMid = 0;
            mHigh = (low << (shift - 96));
        }
#elif CPU_SIZE == CPU_64BIT
        const UInt64 low = mLow;
        const UInt64 high = mHigh;

        if (shift < 64) {
            // shift is 1-63 here, so (64 - shift) is a valid shift count.
            mLow = (low << shift);
            mHigh = (high << shift) | (low >> (64 - shift));
        } else {
            mLow = 0;
            mHigh = (low << (shift - 64));
        }
#endif
        return *this;
    }

    // Logical right shift in place. Only the low 7 bits of `shift` are used,
    // so the effective count is always 0-127. Returns *this.
    UInt128& operator>>=(Int32 shift) noexcept
    {
        // Shift is modulo 128, effectively clamping it between 0-127.
        shift &= 0x7F;
        if (shift == 0) {
            return *this;
        }
#if CPU_SIZE == CPU_32BIT
        // Snapshot the words so every result is computed from the old value.
        const UInt32 low = mLow;
        const UInt32 lowMid = mLowMid;
        const UInt32 highMid = mHighMid;
        const UInt32 high = mHigh;

        if (shift < 32) {
            // shift is 1-31 here, so lshift is 1-31 and never a full-width shift.
            const Int32 lshift = 32 - shift;
            mLow = (low >> shift) | (lowMid << lshift);
            mLowMid = (lowMid >> shift) | (highMid << lshift);
            mHighMid = (highMid >> shift) | (high << lshift);
            mHigh = (high >> shift);
        } else if (shift < 64) {
            // Whole-word move by one word plus a 0-31 bit shift; the carry
            // shift is wrapped to 0 and masked out when the bit shift is 0.
            const Int32 rshift = shift - 32;
            const Int32 lshift = (32 - rshift) & 0x1F;
            const UInt32 lshiftMask = AllOrNothingMask32(lshift);
            mLow = (lowMid >> rshift) | ((highMid << lshift) & lshiftMask);
            mLowMid = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mHighMid = (high >> rshift);
            mHigh = 0;
        } else if (shift < 96) {
            // Same scheme, moving by two words.
            const Int32 rshift = shift - 64;
            const Int32 lshift = (32 - rshift) & 0x1F;
            const UInt32 lshiftMask = AllOrNothingMask32(lshift);
            mLow = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mLowMid = (high >> rshift);
            mHighMid = 0;
            mHigh = 0;
        } else {
            // Only the highest word survives, landing in the lowest word.
            mLow = (high >> (shift - 96));
            mLowMid = 0;
            mHighMid = 0;
            mHigh = 0;
        }
#elif CPU_SIZE == CPU_64BIT
        const UInt64 low = mLow;
        const UInt64 high = mHigh;

        if (shift < 64) {
            // shift is 1-63 here, so (64 - shift) is a valid shift count.
            mLow = (low >> shift) | (high << (64 - shift));
            mHigh = (high >> shift);
        } else {
            mLow = (high >> (shift - 64));
            mHigh = 0;
        }
#endif
        return *this;
    }
};

相关的32位汇编输出相当长,所以除非有人要求,否则我将其省略。

当shift参数在编译时未知时,主要的瓶颈显然是分支。可以做些什么来消除分支,或者有什么可移植的技巧可以加速这些分支呢?

更新1

添加了上例中缺少的复制赋值运算符。 对于那些感兴趣的人,这里是单元测试。我正在使用Catch,因为它很简单。

// Left shift lookup table, from 0-127.
// Entry i is the expected result of shifting the unit-test base value
// UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100) left by i bits; entry 0 is
// the unshifted base value. Each entry is written as (high 64 bits, low 64
// bits). The left-shift unit test indexes this table by shift count.
const UInt128 gLeftShiftLut128[] = {
    UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
    UInt128(0xFFDDBB9977553310, 0xEECCAA8866442200),
    UInt128(0xFFBB7732EEAA6621, 0xDD995510CC884400),
    UInt128(0xFF76EE65DD54CC43, 0xBB32AA2199108800),
    UInt128(0xFEEDDCCBBAA99887, 0x7665544332211000),
    UInt128(0xFDDBB9977553310E, 0xECCAA88664422000),
    UInt128(0xFBB7732EEAA6621D, 0xD995510CC8844000),
    UInt128(0xF76EE65DD54CC43B, 0xB32AA21991088000),
    UInt128(0xEEDDCCBBAA998877, 0x6655443322110000),
    UInt128(0xDDBB9977553310EE, 0xCCAA886644220000),
    UInt128(0xBB7732EEAA6621DD, 0x995510CC88440000),
    UInt128(0x76EE65DD54CC43BB, 0x32AA219910880000),
    UInt128(0xEDDCCBBAA9988776, 0x6554433221100000),
    UInt128(0xDBB9977553310EEC, 0xCAA8866442200000),
    UInt128(0xB7732EEAA6621DD9, 0x95510CC884400000),
    UInt128(0x6EE65DD54CC43BB3, 0x2AA2199108800000),
    UInt128(0xDDCCBBAA99887766, 0x5544332211000000),
    UInt128(0xBB9977553310EECC, 0xAA88664422000000),
    UInt128(0x7732EEAA6621DD99, 0x5510CC8844000000),
    UInt128(0xEE65DD54CC43BB32, 0xAA21991088000000),
    UInt128(0xDCCBBAA998877665, 0x5443322110000000),
    UInt128(0xB9977553310EECCA, 0xA886644220000000),
    UInt128(0x732EEAA6621DD995, 0x510CC88440000000),
    UInt128(0xE65DD54CC43BB32A, 0xA219910880000000),
    UInt128(0xCCBBAA9988776655, 0x4433221100000000),
    UInt128(0x9977553310EECCAA, 0x8866442200000000),
    UInt128(0x32EEAA6621DD9955, 0x10CC884400000000),
    UInt128(0x65DD54CC43BB32AA, 0x2199108800000000),
    UInt128(0xCBBAA99887766554, 0x4332211000000000),
    UInt128(0x977553310EECCAA8, 0x8664422000000000),
    UInt128(0x2EEAA6621DD99551, 0xCC8844000000000),
    UInt128(0x5DD54CC43BB32AA2, 0x1991088000000000),
    UInt128(0xBBAA998877665544, 0x3322110000000000),
    UInt128(0x77553310EECCAA88, 0x6644220000000000),
    UInt128(0xEEAA6621DD995510, 0xCC88440000000000),
    UInt128(0xDD54CC43BB32AA21, 0x9910880000000000),
    UInt128(0xBAA9988776655443, 0x3221100000000000),
    UInt128(0x7553310EECCAA886, 0x6442200000000000),
    UInt128(0xEAA6621DD995510C, 0xC884400000000000),
    UInt128(0xD54CC43BB32AA219, 0x9108800000000000),
    UInt128(0xAA99887766554433, 0x2211000000000000),
    UInt128(0x553310EECCAA8866, 0x4422000000000000),
    UInt128(0xAA6621DD995510CC, 0x8844000000000000),
    UInt128(0x54CC43BB32AA2199, 0x1088000000000000),
    UInt128(0xA998877665544332, 0x2110000000000000),
    UInt128(0x53310EECCAA88664, 0x4220000000000000),
    UInt128(0xA6621DD995510CC8, 0x8440000000000000),
    UInt128(0x4CC43BB32AA21991, 0x880000000000000),
    UInt128(0x9988776655443322, 0x1100000000000000),
    UInt128(0x3310EECCAA886644, 0x2200000000000000),
    UInt128(0x6621DD995510CC88, 0x4400000000000000),
    UInt128(0xCC43BB32AA219910, 0x8800000000000000),
    UInt128(0x9887766554433221, 0x1000000000000000),
    UInt128(0x310EECCAA8866442, 0x2000000000000000),
    UInt128(0x621DD995510CC884, 0x4000000000000000),
    UInt128(0xC43BB32AA2199108, 0x8000000000000000),
    UInt128(0x8877665544332211, 0x0),
    UInt128(0x10EECCAA88664422, 0x0),
    UInt128(0x21DD995510CC8844, 0x0),
    UInt128(0x43BB32AA21991088, 0x0),
    UInt128(0x8776655443322110, 0x0),
    UInt128(0xEECCAA886644220 , 0x0),
    UInt128(0x1DD995510CC88440, 0x0),
    UInt128(0x3BB32AA219910880, 0x0),
    UInt128(0x7766554433221100, 0x0),
    UInt128(0xEECCAA8866442200, 0x0),
    UInt128(0xDD995510CC884400, 0x0),
    UInt128(0xBB32AA2199108800, 0x0),
    UInt128(0x7665544332211000, 0x0),
    UInt128(0xECCAA88664422000, 0x0),
    UInt128(0xD995510CC8844000, 0x0),
    UInt128(0xB32AA21991088000, 0x0),
    UInt128(0x6655443322110000, 0x0),
    UInt128(0xCCAA886644220000, 0x0),
    UInt128(0x995510CC88440000, 0x0),
    UInt128(0x32AA219910880000, 0x0),
    UInt128(0x6554433221100000, 0x0),
    UInt128(0xCAA8866442200000, 0x0),
    UInt128(0x95510CC884400000, 0x0),
    UInt128(0x2AA2199108800000, 0x0),
    UInt128(0x5544332211000000, 0x0),
    UInt128(0xAA88664422000000, 0x0),
    UInt128(0x5510CC8844000000, 0x0),
    UInt128(0xAA21991088000000, 0x0),
    UInt128(0x5443322110000000, 0x0),
    UInt128(0xA886644220000000, 0x0),
    UInt128(0x510CC88440000000, 0x0),
    UInt128(0xA219910880000000, 0x0),
    UInt128(0x4433221100000000, 0x0),
    UInt128(0x8866442200000000, 0x0),
    UInt128(0x10CC884400000000, 0x0),
    UInt128(0x2199108800000000, 0x0),
    UInt128(0x4332211000000000, 0x0),
    UInt128(0x8664422000000000, 0x0),
    UInt128(0xCC8844000000000 , 0x0),
    UInt128(0x1991088000000000, 0x0),
    UInt128(0x3322110000000000, 0x0),
    UInt128(0x6644220000000000, 0x0),
    UInt128(0xCC88440000000000, 0x0),
    UInt128(0x9910880000000000, 0x0),
    UInt128(0x3221100000000000, 0x0),
    UInt128(0x6442200000000000, 0x0),
    UInt128(0xC884400000000000, 0x0),
    UInt128(0x9108800000000000, 0x0),
    UInt128(0x2211000000000000, 0x0),
    UInt128(0x4422000000000000, 0x0),
    UInt128(0x8844000000000000, 0x0),
    UInt128(0x1088000000000000, 0x0),
    UInt128(0x2110000000000000, 0x0),
    UInt128(0x4220000000000000, 0x0),
    UInt128(0x8440000000000000, 0x0),
    UInt128(0x880000000000000 , 0x0),
    UInt128(0x1100000000000000, 0x0),
    UInt128(0x2200000000000000, 0x0),
    UInt128(0x4400000000000000, 0x0),
    UInt128(0x8800000000000000, 0x0),
    UInt128(0x1000000000000000, 0x0),
    UInt128(0x2000000000000000, 0x0),
    UInt128(0x4000000000000000, 0x0),
    UInt128(0x8000000000000000, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0),
    UInt128(0x0, 0x0)
};
// Right shift lookup table, from 0-127.
// Entry i is the expected result of shifting the unit-test base value
// UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100) right by i bits (zeros are
// shifted in from the top); entry 0 is the unshifted base value. Each entry
// is written as (high 64 bits, low 64 bits). The right-shift unit test
// indexes this table by shift count.
const UInt128 gRightShiftLut128[] = {
    UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
    UInt128(0x7FF76EE65DD54CC4, 0x3BB32AA219910880),
    UInt128(0x3FFBB7732EEAA662, 0x1DD995510CC88440),
    UInt128(0x1FFDDBB997755331, 0xEECCAA886644220),
    UInt128(0xFFEEDDCCBBAA998, 0x8776655443322110),
    UInt128(0x7FF76EE65DD54CC, 0x43BB32AA21991088),
    UInt128(0x3FFBB7732EEAA66, 0x21DD995510CC8844),
    UInt128(0x1FFDDBB99775533, 0x10EECCAA88664422),
    UInt128(0xFFEEDDCCBBAA99, 0x8877665544332211),
    UInt128(0x7FF76EE65DD54C, 0xC43BB32AA2199108),
    UInt128(0x3FFBB7732EEAA6, 0x621DD995510CC884),
    UInt128(0x1FFDDBB9977553, 0x310EECCAA8866442),
    UInt128(0xFFEEDDCCBBAA9, 0x9887766554433221),
    UInt128(0x7FF76EE65DD54, 0xCC43BB32AA219910),
    UInt128(0x3FFBB7732EEAA, 0x6621DD995510CC88),
    UInt128(0x1FFDDBB997755, 0x3310EECCAA886644),
    UInt128(0xFFEEDDCCBBAA, 0x9988776655443322),
    UInt128(0x7FF76EE65DD5, 0x4CC43BB32AA21991),
    UInt128(0x3FFBB7732EEA, 0xA6621DD995510CC8),
    UInt128(0x1FFDDBB99775, 0x53310EECCAA88664),
    UInt128(0xFFEEDDCCBBA, 0xA998877665544332),
    UInt128(0x7FF76EE65DD, 0x54CC43BB32AA2199),
    UInt128(0x3FFBB7732EE, 0xAA6621DD995510CC),
    UInt128(0x1FFDDBB9977, 0x553310EECCAA8866),
    UInt128(0xFFEEDDCCBB, 0xAA99887766554433),
    UInt128(0x7FF76EE65D, 0xD54CC43BB32AA219),
    UInt128(0x3FFBB7732E, 0xEAA6621DD995510C),
    UInt128(0x1FFDDBB997, 0x7553310EECCAA886),
    UInt128(0xFFEEDDCCB, 0xBAA9988776655443),
    UInt128(0x7FF76EE65, 0xDD54CC43BB32AA21),
    UInt128(0x3FFBB7732, 0xEEAA6621DD995510),
    UInt128(0x1FFDDBB99, 0x77553310EECCAA88),
    UInt128(0xFFEEDDCC, 0xBBAA998877665544),
    UInt128(0x7FF76EE6, 0x5DD54CC43BB32AA2),
    UInt128(0x3FFBB773, 0x2EEAA6621DD99551),
    UInt128(0x1FFDDBB9, 0x977553310EECCAA8),
    UInt128(0xFFEEDDC, 0xCBBAA99887766554),
    UInt128(0x7FF76EE, 0x65DD54CC43BB32AA),
    UInt128(0x3FFBB77, 0x32EEAA6621DD9955),
    UInt128(0x1FFDDBB, 0x9977553310EECCAA),
    UInt128(0xFFEEDD, 0xCCBBAA9988776655),
    UInt128(0x7FF76E, 0xE65DD54CC43BB32A),
    UInt128(0x3FFBB7, 0x732EEAA6621DD995),
    UInt128(0x1FFDDB, 0xB9977553310EECCA),
    UInt128(0xFFEED, 0xDCCBBAA998877665),
    UInt128(0x7FF76, 0xEE65DD54CC43BB32),
    UInt128(0x3FFBB, 0x7732EEAA6621DD99),
    UInt128(0x1FFDD, 0xBB9977553310EECC),
    UInt128(0xFFEE, 0xDDCCBBAA99887766),
    UInt128(0x7FF7, 0x6EE65DD54CC43BB3),
    UInt128(0x3FFB, 0xB7732EEAA6621DD9),
    UInt128(0x1FFD, 0xDBB9977553310EEC),
    UInt128(0xFFE, 0xEDDCCBBAA9988776),
    UInt128(0x7FF, 0x76EE65DD54CC43BB),
    UInt128(0x3FF, 0xBB7732EEAA6621DD),
    UInt128(0x1FF, 0xDDBB9977553310EE),
    UInt128(0xFF, 0xEEDDCCBBAA998877),
    UInt128(0x7F, 0xF76EE65DD54CC43B),
    UInt128(0x3F, 0xFBB7732EEAA6621D),
    UInt128(0x1F, 0xFDDBB9977553310E),
    UInt128(0xF, 0xFEEDDCCBBAA99887),
    UInt128(0x7, 0xFF76EE65DD54CC43),
    UInt128(0x3, 0xFFBB7732EEAA6621),
    UInt128(0x1, 0xFFDDBB9977553310),
    UInt128(0x0, 0xFFEEDDCCBBAA9988),
    UInt128(0x0, 0x7FF76EE65DD54CC4),
    UInt128(0x0, 0x3FFBB7732EEAA662),
    UInt128(0x0, 0x1FFDDBB997755331),
    UInt128(0x0, 0xFFEEDDCCBBAA998),
    UInt128(0x0, 0x7FF76EE65DD54CC),
    UInt128(0x0, 0x3FFBB7732EEAA66),
    UInt128(0x0, 0x1FFDDBB99775533),
    UInt128(0x0, 0xFFEEDDCCBBAA99),
    UInt128(0x0, 0x7FF76EE65DD54C),
    UInt128(0x0, 0x3FFBB7732EEAA6),
    UInt128(0x0, 0x1FFDDBB9977553),
    UInt128(0x0, 0xFFEEDDCCBBAA9),
    UInt128(0x0, 0x7FF76EE65DD54),
    UInt128(0x0, 0x3FFBB7732EEAA),
    UInt128(0x0, 0x1FFDDBB997755),
    UInt128(0x0, 0xFFEEDDCCBBAA),
    UInt128(0x0, 0x7FF76EE65DD5),
    UInt128(0x0, 0x3FFBB7732EEA),
    UInt128(0x0, 0x1FFDDBB99775),
    UInt128(0x0, 0xFFEEDDCCBBA),
    UInt128(0x0, 0x7FF76EE65DD),
    UInt128(0x0, 0x3FFBB7732EE),
    UInt128(0x0, 0x1FFDDBB9977),
    UInt128(0x0, 0xFFEEDDCCBB),
    UInt128(0x0, 0x7FF76EE65D),
    UInt128(0x0, 0x3FFBB7732E),
    UInt128(0x0, 0x1FFDDBB997),
    UInt128(0x0, 0xFFEEDDCCB),
    UInt128(0x0, 0x7FF76EE65),
    UInt128(0x0, 0x3FFBB7732),
    UInt128(0x0, 0x1FFDDBB99),
    UInt128(0x0, 0xFFEEDDCC),
    UInt128(0x0, 0x7FF76EE6),
    UInt128(0x0, 0x3FFBB773),
    UInt128(0x0, 0x1FFDDBB9),
    UInt128(0x0, 0xFFEEDDC),
    UInt128(0x0, 0x7FF76EE),
    UInt128(0x0, 0x3FFBB77),
    UInt128(0x0, 0x1FFDDBB),
    UInt128(0x0, 0xFFEEDD),
    UInt128(0x0, 0x7FF76E),
    UInt128(0x0, 0x3FFBB7),
    UInt128(0x0, 0x1FFDDB),
    UInt128(0x0, 0xFFEED),
    UInt128(0x0, 0x7FF76),
    UInt128(0x0, 0x3FFBB),
    UInt128(0x0, 0x1FFDD),
    UInt128(0x0, 0xFFEE),
    UInt128(0x0, 0x7FF7),
    UInt128(0x0, 0x3FFB),
    UInt128(0x0, 0x1FFD),
    UInt128(0x0, 0xFFE),
    UInt128(0x0, 0x7FF),
    UInt128(0x0, 0x3FF),
    UInt128(0x0, 0x1FF),
    UInt128(0x0, 0xFF),
    UInt128(0x0, 0x7F),
    UInt128(0x0, 0x3F),
    UInt128(0x0, 0x1F),
    UInt128(0x0, 0xF),
    UInt128(0x0, 0x7),
    UInt128(0x0, 0x3),
    UInt128(0x0, 0x1)
};

// Checks UInt128::operator<<= against gLeftShiftLut128 for every shift count
// from 0 to 127, always starting from the same base value.
TEST_CASE("UInt128 left shift produces correct results.") {
    auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    // Start at 0 so the zero-shift early-return path and table entry 0 are
    // verified too, matching the range covered by the right-shift test.
    for (auto i = 0; i <= 127; i++) {
        // Work on a fresh copy; the operator shifts in place.
        auto sample = base;
        sample <<= i;

        // Report the failing shift count alongside the assertion.
        INFO("i = " << i);
        REQUIRE(sample == gLeftShiftLut128[i]);
    }
}

// Checks UInt128::operator>>= against gRightShiftLut128 for every shift count
// from 0 to 127, always starting from the same base value.
TEST_CASE("UInt128 right shift produces correct results.") {
    const UInt128 original(0xFFEEDDCCBBAA9988, 0x7766554433221100);

    for (int count = 0; count < 128; ++count) {
        // Work on a fresh copy; the operator shifts in place.
        UInt128 shifted = original;
        shifted >>= count;

        // Report the failing shift count alongside the assertion.
        INFO("i = " << count);
        REQUIRE(shifted == gRightShiftLut128[count]);
    }
}

2 个答案:

答案 0 :(得分:1)

我没有对它进行基准测试,但类似下面这样的写法是无分支的:

// Branch-free left shift for the 32-bit layouts.
//
// The four words are first shifted by the sub-word bit count (shift & 31)
// and written into an 8-word window next to four zero words. The whole-word
// part of the shift (bits 5-6 of `shift`) then selects which 16-byte slice of
// that window is copied back over *this.
inline
UInt128& UInt128::operator<<=(Int32 shift) noexcept
{
    const auto bitShift = shift & 31;
    // The carry from the lower neighbour needs a right shift of
    // (32 - bitShift), which would be a full-width shift when bitShift is 0.
    // It is therefore applied as ">> 1" followed by ">> (31 - bitShift)".
    const auto carryShift = 31 - bitShift;
    const auto wordShift = shift >> 5 & 3;
#if CPU_TYPE == CPU_LE32
    // Lowest word first in memory: zero padding sits below the value, and
    // starting the copy wordShift words earlier pulls zeros into the bottom.
    UInt32 window[8] = {
        0, 0, 0, 0,
        (mLow << bitShift),
        (mLowMid << bitShift) | ((mLow >> 1) >> carryShift),
        (mHighMid << bitShift) | ((mLowMid >> 1) >> carryShift),
        (mHigh << bitShift) | ((mHighMid >> 1) >> carryShift)
    };
    const auto firstWord = 4 - wordShift;
#elif CPU_TYPE == CPU_BE32
    // Highest word first in memory: zero padding sits after the value, and
    // starting the copy wordShift words later pulls zeros into the bottom.
    UInt32 window[8] = {
        (mHigh << bitShift) | ((mHighMid >> 1) >> carryShift),
        (mHighMid << bitShift) | ((mLowMid >> 1) >> carryShift),
        (mLowMid << bitShift) | ((mLow >> 1) >> carryShift),
        (mLow << bitShift),
        0, 0, 0, 0
    };
    const auto firstWord = wordShift;
#else
    // No 32-bit layout selected: the window stays all zero.
    UInt32 window[8] = { };
    const auto firstWord = wordShift;
#endif
    memcpy(this, &window[firstWord], 16);
    return *this;
}

// Branch-free logical right shift for the 32-bit layouts.
//
// The four words are first shifted by the sub-word bit count (shift & 31)
// and written into an 8-word window next to four zero words. The whole-word
// part of the shift (bits 5-6 of `shift`) then selects which 16-byte slice of
// that window is copied back over *this.
inline
UInt128& UInt128::operator>>=(Int32 shift) noexcept
{
    auto rshift = shift & 31;
    // The carry from the upper neighbour needs a left shift of (32 - rshift),
    // which would be a full-width shift when rshift is 0. It is therefore
    // applied as "<< (31 - rshift)" followed by "<< 1".
    auto lshift = 31 - rshift;
    UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
    // Lowest word first in memory: zero padding sits above the value, and
    // starting the copy later pulls zeros into the top words.
    mLow >> rshift | mLowMid << lshift << 1,
    mLowMid >> rshift | mHighMid << lshift << 1,
    mHighMid >> rshift | mHigh << lshift << 1,
    mHigh >> rshift,
    0, 0, 0, 0
#elif CPU_TYPE == CPU_BE32
    // Highest word first in memory: zero padding sits before the value, and
    // starting the copy earlier pulls zeros into the top words.
    0, 0, 0, 0,
    mHigh >> rshift,
    mHighMid >> rshift | mHigh << lshift << 1,
    mLowMid >> rshift | mHighMid << lshift << 1,
    mLow >> rshift | mLowMid << lshift << 1
#endif
    };
    // (shift >> 5 & 3) is the number of whole 32-bit words to shift by.
    memcpy(this, &parts[
#if CPU_TYPE == CPU_BE32
        4 -
#endif
        (shift >> 5 & 3)], 16);
    return *this;
}

答案 1 :(得分:1)

我认为使用数组可以更好地实现UInt128,其中字节顺序不是问题,例如,

alignas (16) uint32_t data[4];或:alignas (16) uint64_t data[2];

请注意,对于在堆上创建的对象,不保证对齐;虽然有些ABI确实有16字节的最小对齐。您可以查看alignof(std::max_align_t)。如果没有,您将需要替换SIMD的全局运算符new和delete函数(例如,SSE)。

对于uint32_t实现,可以把移位拆分为“字(word)”移位和“位(bit)”移位两部分。另外,用有符号的Int32作为移位计数是没有意义的……

// Left shift for a UInt128 stored as an array of four 32-bit words, where
// data[0] is written as the least significant word and data[3] as the most
// significant one. Only the low 7 bits of `shift` are used.
// NOTE(review): assumes `data` is the uint32_t[4] member proposed in the
// accompanying text - confirm against the class definition.
inline UInt128 &
UInt128::operator <<= (uint32_t shift) noexcept
{
    shift &= 0x7f;
    const uint32_t wordShift = shift >> 5;   // whole 32-bit words to move
    const uint32_t bitShift = shift & 0x1f;  // remaining bits within a word

    // Branch-free carry handling: the carry comes from the next lower word
    // shifted right by (32 - bitShift). For bitShift == 0 that count wraps
    // to 0 and the mask becomes 0, so no carry is merged in.
    const uint32_t carryShift = (0u - bitShift) & 31u;
    const uint32_t carryMask = ((bitShift - 1u) >> 31) - 1u; // 0xffffffff or 0x0

    // Snapshot the words so the assignment order below cannot matter.
    const uint32_t w0 = data[0];
    const uint32_t w1 = data[1];
    const uint32_t w2 = data[2];
    const uint32_t w3 = data[3];

    // Bits spilling out of the top of `lower` into the word above it.
    const auto carryFrom = [carryShift, carryMask](uint32_t lower) -> uint32_t {
        return (lower >> carryShift) & carryMask;
    };

    switch (wordShift)
    {
    case 0:
        data[0] = w0 << bitShift;
        data[1] = (w1 << bitShift) | carryFrom(w0);
        data[2] = (w2 << bitShift) | carryFrom(w1);
        data[3] = (w3 << bitShift) | carryFrom(w2);
        break;

    case 1:
        data[0] = 0;
        data[1] = w0 << bitShift;
        data[2] = (w1 << bitShift) | carryFrom(w0);
        data[3] = (w2 << bitShift) | carryFrom(w1);
        break;

    case 2:
        data[0] = 0;
        data[1] = 0;
        data[2] = w0 << bitShift;
        data[3] = (w1 << bitShift) | carryFrom(w0);
        break;

    case 3:
        data[0] = 0;
        data[1] = 0;
        data[2] = 0;
        data[3] = w0 << bitShift;
        break;
    }

    return *this;
}

我很确定我的数据索引是正确的。如果shift是编译时常量,编译器应该能够非常积极地优化此代码。

右移就留给您自己实现;需要注意的是,必须按从低位字到高位字的顺序更新data,以免某个字在被读取之前就被覆盖。除此之外,交换shl和shr的角色应该只是一个简单的练习。uint64_t数据版本应该也非常简单。