Question

根据this answer，我创建了以下测试程序：

#include <iso646.h>
#include <immintrin.h>

#include <stdio.h>

/*
 * SHIFT_LEFT ( N ) defines a function `shift_left_N ( __m256i A )`.
 *
 * NOTE: the shift is by N *bytes*, not bits. The generated function moves the
 * whole 256-bit value towards the high end by N bytes, carrying bytes from the
 * low 128-bit lane into the high lane and filling with zero bytes.
 *
 * N must be a plain decimal literal (it is pasted into the function name) and
 * is intended to be in the range 0..31.
 *
 * How it works: T = _mm256_permute2x128_si256 ( A, A, 0x08 ) is the vector
 * [ high lane = low lane of A : low lane = zero ]. _mm256_alignr_epi8 then
 * combines A and T per 128-bit lane, so the bytes shifted out of the low lane
 * arrive in the high lane.
 *
 * Every branch is compiled even though only one is taken, so each immediate is
 * masked to 0..255 to stay a valid 8-bit constant in the branches that are dead
 * for a given N.
 *
 * `static` gives every translation unit its own copy; a plain `inline`
 * definition has no external definition in C99 and may fail to link.
 */
#define SHIFT_LEFT( N ) \
\
    static inline __m256i shift_left_##N ( __m256i A  ) { \
\
    const __m256i T = _mm256_permute2x128_si256 ( A, A, _MM_SHUFFLE ( 0, 0, 2, 0 ) ); \
\
    if ( N == 0 ) return A; \
    else if ( N <  16 ) return _mm256_alignr_epi8 ( A, T, ( ( 16 - N ) & 0xFF ) ); \
    else if ( N == 16 ) return T; \
    else return _mm256_slli_si256 ( T, ( ( N - 16 ) & 0xFF ) ); \
}

/*
 * Writes the binary representation of n to stdout, most significant bit
 * first, as '0' / '1' characters in groups of two with a single space after
 * every group. No newline is written.
 */
void print ( const size_t n ) {

    /* Mask with only the top bit of size_t set, independent of its width. */
    size_t i = ~ ( ( size_t ) -1 >> 1 );

    while ( i ) {

        /* ( n & i ) is either 0 or i itself, so it has to be reduced to a
           0/1 decision before it can be turned into a digit character. */
        putchar ( ( n & i ) ? '1' : '0' );
        i >>= 1;
        putchar ( ( n & i ) ? '1' : '0' );
        i >>= 1;

        putchar ( ' ' );
    }
}

/* Instantiate shift_left_2 ( ), the SHIFT_LEFT variant for N == 2. */
SHIFT_LEFT ( 2 );

/*
 * Test driver: shifts a vector whose lowest 64-bit element is 3 with
 * shift_left_2 ( ) and prints the four 64-bit elements of the result,
 * highest element first, followed by a newline.
 */
int main ( ) {

    __m256i a = _mm256_set_epi64x ( 0x00, 0x00, 0x00, 0x03 );
    __m256i b = shift_left_2 ( a );

    /* Copy the vector out with a store intrinsic instead of reading it
       through a casted size_t pointer, which violates the aliasing rules. */
    unsigned long long c [ 4 ];
    _mm256_storeu_si256 ( ( __m256i * ) c, b );

    print ( ( size_t ) c [ 3 ] ); print ( ( size_t ) c [ 2 ] ); print ( ( size_t ) c [ 1 ] ); print ( ( size_t ) c [ 0 ] ); putchar ( '\n' );

    return 0;
}

据我观察，上述程序没有输出我所预期的结果。我读过这些内部函数（intrinsics）的说明，但仍然弄不明白它们是如何协同工作的。是我哪里做错了，还是 shift_left() 的实现本身有问题？

EDIT1：我开始意识到（并在评论中得到确认），这段代码只能移位最多 32 个字节，而且是按字节而不是按位移位，所以它不能满足我的目标。这就留下了一个问题：“如何在 AVX2 中实现跨通道（lane-crossing）的逻辑按位移位（左移和右移）”。

EDIT2：快进：与此同时，我对它的运作方式不太感兴趣，并编码了我需要的东西。我已经发布了代码（转移和旋转）并接受了它作为答案。

Answer 1

可能不是您期望的那种答案。但这是一个合理有效的解决方案，实际上适用于运行时移位量。

费用如下：

预处理：约 12–14 条指令
旋转（Rotate）：5 条指令
移位（Shift）：6 条指令

为了对任何数据进行移位或旋转，必须先对移位量进行预处理。预处理完成后，就可以高效地执行移位/旋转。

由于预处理步骤非常昂贵，因此该解决方案利用一个对象来保持预处理的移位量，以便在移动相同数量时可以多次重复使用。

为了提高效率，对象应该在与移位代码相同的范围内。这允许编译器将对象的所有字段提升为寄存器。此外，建议强制内联该类的所有方法。

#include <stdint.h>
#include <immintrin.h>

//  Left shift / left rotate of a whole 256-bit vector by a bit count that is
//  only known at run time.
//
//  The constructor precomputes everything that depends on the shift amount;
//  rotate() and shift() then only apply those precomputed values, so a single
//  object can be reused for many vectors shifted by the same amount.
class LeftShifter_AVX2{
public:
    //  Precompute the permutation indices, in-lane shift counts and the
    //  clearing mask for a shift of `bits` bits.
    LeftShifter_AVX2(uint32_t bits){
        const uint32_t word_count = bits / 32;  //  whole 32-bit lanes to move
        const uint32_t bit_count  = bits % 32;  //  remaining shift inside a lane

        //  For every destination lane: index of the lane supplying its upper
        //  part (permL) and of the next lower lane supplying the bits carried
        //  in from below (permR). A negative index marks a source position
        //  below lane 0; the mask computed further down is built from these.
        const __m256i lane_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        permL = _mm256_sub_epi32(lane_index, _mm256_set1_epi32(word_count));
        permR = _mm256_sub_epi32(permL, _mm256_set1_epi32(1));

        shiftL = _mm_cvtsi32_si128(bit_count);
        shiftR = _mm_cvtsi32_si128(32 - bit_count);

        //  Bits to clear when shifting instead of rotating: whole lanes whose
        //  permL index is negative, plus the carried-in low bits of lanes
        //  whose permR index is negative.
        const __m256i zero = _mm256_setzero_si256();
        const __m256i belowL = _mm256_cmpgt_epi32(zero, permL);
        const __m256i belowR = _mm256_cmpgt_epi32(zero, permR);
        mask = _mm256_or_si256(belowL, _mm256_srl_epi32(belowR, shiftR));
    }

    //  Combines, per 32-bit lane, the lane selected by permL shifted left by
    //  shiftL with the lane selected by permR shifted right by shiftR.
    __m256i rotate(__m256i x) const{
        const __m256i upper = _mm256_sll_epi32(_mm256_permutevar8x32_epi32(x, permL), shiftL);
        const __m256i lower = _mm256_srl_epi32(_mm256_permutevar8x32_epi32(x, permR), shiftR);
        return _mm256_or_si256(upper, lower);
    }

    //  Same as rotate(), with every bit selected by `mask` cleared afterwards.
    __m256i shift(__m256i x) const{
        const __m256i rotated = rotate(x);
        return _mm256_andnot_si256(mask, rotated);
    }

private:
    __m256i permL;   //  per-lane source index for the left-shifted part
    __m256i permR;   //  per-lane source index for the right-shifted part
    __m128i shiftL;  //  in-lane left shift count (bits % 32)
    __m128i shiftR;  //  in-lane right shift count (32 - bits % 32)
    __m256i mask;    //  bits that shift() clears from the rotated value
};

测试程序：

#include <iostream>
using namespace std;

//  Prints the 32 bytes of x in memory order (lowest address first) as decimal
//  numbers, each followed by a space, and ends the line.
void print_u8(__m256i x){
    //  Inspect the vector through an unsigned char pointer; unlike reading
    //  the inactive member of a union, this is well defined in C++.
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(&x);
    for (int c = 0; c < 32; c++){
        std::cout << static_cast<int>(bytes[c]) << " ";
    }
    std::cout << std::endl;
}

//  Test driver: fills a vector with the byte values 0..31 and prints it
//  unshifted and after left shifts by 0, 8, 32 and 40 bits.
int main(){
    char buffer[32];
    for (int c = 0; c < 32; c++){
        buffer[c] = (char)c;
    }
    //  Load the bytes with an unaligned load rather than punning through a
    //  union, whose inactive member may not be read in ISO C++.
    const __m256i x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(buffer));

    print_u8(x);
    print_u8(LeftShifter_AVX2(0).shift(x));
    print_u8(LeftShifter_AVX2(8).shift(x));
    print_u8(LeftShifter_AVX2(32).shift(x));
    print_u8(LeftShifter_AVX2(40).shift(x));
}

输出：

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 
0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
0 0 0 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 
0 0 0 0 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26

右移非常相似。我将把它作为读者的练习。

Answer 2

以下代码在AVX2中实现了车道交叉逻辑按位移位/旋转（左和右）：

// Path of the XML document that Main loads, modifies and saves back in place.
const string FILENAME = @"c:\temp\test.xml";
// Path of the second XML document that Main loads and only reads from.
const string DATABASE = @"c:\temp\test1.xml";
/// <summary>
/// Loads the document at FILENAME and, for every "funding-source" element whose
/// string value is a key of a lookup built from the document at DATABASE, adds an
/// "institution-id" element to that element's "institution-wrap" child. The
/// document is then saved back to FILENAME, "Done" is printed, and the program
/// waits for a line of console input.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    XDocument doc = XDocument.Load(FILENAME);

    XElement article = doc.Root;
    XNamespace ns = article.GetDefaultNamespace();

    // Second document: namespaces are resolved from the prefixes declared on its root.
    XDocument docDatabase = XDocument.Load(DATABASE);
    XElement rdf = docDatabase.Root;
    XNamespace nsSkosxl = rdf.GetNamespaceOfPrefix("skosxl");
    XNamespace nsSkos = rdf.GetNamespaceOfPrefix("skos");
    XNamespace nsRdf = rdf.GetNamespaceOfPrefix("rdf");

    // Lookup: text of the first "literalForm" descendant of each "Concept" ->
    // list of "about" attribute values with their first 18 characters removed.
    // NOTE(review): the value selector reads y.Parent.Element(Concept), i.e. the
    // first "Concept" child of y's parent rather than y itself - confirm intended.
    // NOTE(review): Substring(18) presumably strips a fixed-length URI prefix - confirm.
    List<XElement> prefLabels = rdf.Descendants(nsSkos + "Concept").ToList();
    Dictionary<string, List<string>> dictLabels = prefLabels.GroupBy(x => (string)x.Descendants(nsSkosxl + "literalForm").FirstOrDefault(), y => (string)y.Parent.Element(nsSkos+"Concept").Attribute(nsRdf + "about").Value.Substring(18))
        .ToDictionary(x => x.Key, y => y.ToList());

    // Materialized with ToList() before the loop below modifies the tree.
    List<XElement> fundingSources = article.Descendants(ns + "funding-source").ToList();

    foreach (XElement fundingSource in fundingSources)
    {
        // NOTE(review): Element() returns null when there is no such child, and
        // institutionWrap is used below without a null check.
        XElement institutionWrap = fundingSource.Element(ns + "institution-wrap");
        // The lookup key is the string value of the funding-source element.
        string institution = (string)fundingSource;

        if (dictLabels.ContainsKey(institution))
        {
            // NOTE(review): "institution-id" is created without the ns namespace
            // that the lookups above use - confirm this is intended.
            institutionWrap.Add(new XElement("institution-id", new object[] {
                                                new XAttribute("institution-id-type","fundref"),
                                                dictLabels[institution]
                                             }));
        }
    }
    doc.Save(FILENAME);
    Console.WriteLine("Done");
    Console.ReadLine();
}

我试图让_mm256_permute4x64_epi64操作（在任何情况下必须是两个）部分重叠，这应该将整体延迟保持在最低限度。

评论者给出的大多数建议或线索都有助于整理代码，感谢这些。显然，欢迎改进和/或任何其他意见。

我认为 Mysticial 的回答很有意思，但过于复杂，难以高效地用于通用的移位/旋转场景，例如用在库中。

如何在AVX2中实现车道交叉逻辑按位移位/旋转（左和右）

2 个答案: