Question

我读过一些文章，介绍 Vector<T> 如何启用 SIMD，并通过 JIT 内在函数（intrinsics）实现，因此在使用它时编译器会正确地生成 AVX / SSE / ... 指令，从而得到比传统线性循环更快的代码（例如here）。

我决定尝试重写我的一个方法，看看能否获得一些加速，但到目前为止我失败了：矢量化代码的运行速度比原来的版本慢约3倍，而我并不完全确定原因。以下是该方法的两个版本，它检查两个Span<float>实例中所有相同位置的元素对相对于阈值是否处于同一侧（即同时大于阈值，或同时不大于阈值）。

// Classic implementation
/// <summary>
/// Scalar baseline: checks that every pair of elements at the same index in
/// <paramref name="x1"/> and <paramref name="x2"/> lies on the same side of
/// <paramref name="threshold"/> (both strictly greater, or both not greater).
/// </summary>
/// <param name="x1">First sequence; its length determines how many pairs are compared.</param>
/// <param name="x2">Second sequence; must contain at least <c>x1.Length</c> elements.</param>
/// <param name="threshold">Value each element is compared against with <c>&gt;</c>.</param>
/// <returns><c>true</c> if all compared pairs agree (including when <paramref name="x1"/> is empty); otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentException"><paramref name="x2"/> is shorter than <paramref name="x1"/>.</exception>
public static unsafe bool MatchElementwiseThreshold(this Span<float> x1, Span<float> x2, float threshold)
{
    // The loop below indexes x2 through a raw pointer using x1's length, so a
    // shorter x2 would be read out of bounds. Reject that case up front.
    if (x2.Length < x1.Length)
        throw new ArgumentException("x2 must contain at least x1.Length elements.", nameof(x2));

    // MemoryMarshal.GetReference is the shipped replacement for the preview-only
    // Span<T>.DangerousGetPinnableReference().
    fixed (float* px1 = &System.Runtime.InteropServices.MemoryMarshal.GetReference(x1),
                  px2 = &System.Runtime.InteropServices.MemoryMarshal.GetReference(x2))
    {
        for (int i = 0; i < x1.Length; i++)
        {
            if ((px1[i] > threshold) != (px2[i] > threshold))
                return false;
        }
    }

    return true;
}

// Vectorized
/// <summary>
/// Vector&lt;float&gt;-based variant of the element-wise threshold check: returns
/// <c>true</c> when every pair of elements at the same index in <paramref name="x1"/>
/// and <paramref name="x2"/> is on the same side of <paramref name="threshold"/>.
/// </summary>
/// <param name="x1">First sequence; its length drives both the chunked loop and the scalar tail.</param>
/// <param name="x2">Second sequence, read at the same offsets as <paramref name="x1"/>.</param>
/// <param name="threshold">Value each element is compared against with "greater than".</param>
/// <returns><c>true</c> if no differing pair is found; otherwise <c>false</c>.</returns>
/// <remarks>
/// NOTE(review): the lengths of the two spans are never compared; px2 is indexed with
/// offsets derived from x1.Length only, so a shorter x2 would be read past its end.
/// </remarks>
public static unsafe bool MatchElementwiseThresholdSIMD(this Span<float> x1, Span<float> x2, float threshold)
{
    // Setup the test vector: fill a stack buffer of Vector<float>.Count floats with
    // the threshold and reinterpret it as a single Vector<float>.
    // NOTE(review): the Vector<float>(float) constructor performs the same broadcast.
    int l = Vector<float>.Count;
    float* arr = stackalloc float[l];
    for (int i = 0; i < l; i++)
        arr[i] = threshold;
    Vector<float> cmp = Unsafe.Read<Vector<float>>(arr);
    fixed (float* px1 = &x1.DangerousGetPinnableReference(), px2 = &x2.DangerousGetPinnableReference())
    {
        // Iterate in chunks
        int
            div = x1.Length / l,   // number of full vector-sized chunks
            mod = x1.Length % l,   // leftover elements handled by the scalar tail
            i = 0,                 // chunk counter
            offset = 0;            // element offset of the current chunk
        for (; i < div; i += 1, offset += l)
        {
            // Load one chunk from each span and compare both against the threshold.
            // Per the Vector.GreaterThan contract, each result lane is all-bits-set
            // when the comparison holds and zero otherwise.
            Vector<float>
                v1 = Unsafe.Read<Vector<float>>(px1 + offset),
                v1cmp = Vector.GreaterThan<float>(v1, cmp),
                v2 = Unsafe.Read<Vector<float>>(px2 + offset),
                v2cmp = Vector.GreaterThan<float>(v2, cmp);
            // View the two mask vectors as raw float lanes through their addresses.
            float*
                pcmp1 = (float*)Unsafe.AsPointer(ref v1cmp),
                pcmp2 = (float*)Unsafe.AsPointer(ref v2cmp);
            // A lane equal to 0 means "not greater than threshold"; the pair disagrees
            // when exactly one of the two lanes is 0.
            // NOTE(review): this inner loop inspects every lane with scalar code, so each
            // chunk still costs l scalar comparisons on top of the vector work - a likely
            // contributor to the reported slowdown; confirm with a benchmark.
            for (int j = 0; j < l; j++)
                if (pcmp1[j] == 0 != (pcmp2[j] == 0))
                    return false;
        }

        // Test the remaining items, if any
        if (mod == 0) return true;
        // Scalar tail over the last mod elements, using the same test as the classic version.
        for (i = x1.Length - mod; i < x1.Length; i++)
            if (px1[i] > threshold != px2[i] > threshold)
                return false;
    }
    return true;
}

正如我所说，我使用BenchmarkDotNet测试了两个版本，而使用Vector<T>的版本运行速度比另一个慢约3倍。我尝试使用不同长度的跨度（从大约100到超过2000）运行测试，但是矢量化方法比另一个慢得多。

我错过了一些明显的东西吗？

谢谢！

编辑：我之所以使用不安全的代码，并尝试在不进行并行化的前提下尽可能优化这段代码，是因为此方法已经在 Parallel.For 迭代内部被调用。

另外，能够在多个线程上并行化代码，通常并不是放任各个并行任务本身不做优化的好理由。

Answer 1

我有同样的问题。解决方案是在项目属性中取消选中首选32位选项。

SIMD仅对64位进程启用。因此，请确保您的应用直接针对x64，或者被编译为“任何CPU”且未标记为“首选32位”。 [Source]

Answer 2

**编辑**阅读a blog post by Marc Gravell后，我发现这可以简单地实现......

/// <summary>
/// Checks that every pair of elements at the same index in <paramref name="x1"/> and
/// <paramref name="x2"/> lies on the same side of <paramref name="threshold"/>
/// (both strictly greater, or both not greater), using <see cref="Vector{T}"/> for the
/// bulk of the data when hardware acceleration is available.
/// </summary>
/// <param name="x1">First sequence.</param>
/// <param name="x2">Second sequence; must have the same length as <paramref name="x1"/>.</param>
/// <param name="threshold">Value each element is compared against with "greater than".</param>
/// <returns><c>true</c> if all pairs agree (including for empty input); otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentException">The two spans have different lengths.</exception>
public static bool MatchElementwiseThresholdSIMD(ReadOnlySpan<float> x1, ReadOnlySpan<float> x2, float threshold)
{
    if (x1.Length != x2.Length) throw new ArgumentException("x1.Length != x2.Length");

    if (Vector.IsHardwareAccelerated)
    {
        // Reinterpret the float data as whole vectors; elements that do not fill a
        // complete vector are left for the scalar loop below. MemoryMarshal.Cast is the
        // shipped replacement for the preview-only NonPortableCast extension.
        ReadOnlySpan<Vector<float>> vx1 = System.Runtime.InteropServices.MemoryMarshal.Cast<float, Vector<float>>(x1);
        ReadOnlySpan<Vector<float>> vx2 = System.Runtime.InteropServices.MemoryMarshal.Cast<float, Vector<float>>(x2);

        var vthreshold = new Vector<float>(threshold);
        for (int i = 0; i < vx1.Length; ++i)
        {
            // The float overload of GreaterThan yields an integer mask per lane
            // (all bits set when greater, zero otherwise), so the two masks can be
            // compared directly: any differing lane means one pair disagrees.
            Vector<int> above1 = Vector.GreaterThan(vx1[i], vthreshold);
            Vector<int> above2 = Vector.GreaterThan(vx2[i], vthreshold);
            if (above1 != above2)
                return false;
        }

        // Skip everything already checked by the vector loop.
        int consumed = Vector<float>.Count * vx1.Length;
        x1 = x1.Slice(consumed);
        x2 = x2.Slice(consumed);
    }

    // Scalar path: the tail left over by the vector loop, or the whole input when
    // hardware acceleration is unavailable.
    for (var i = 0; i < x1.Length; i++)
    {
        if ((x1[i] > threshold) != (x2[i] > threshold))
            return false;
    }

    return true;
}

这虽然不如直接使用数组那么快（如果你手头本来就是数组的话），但仍然比非SIMD版本快得多......

（另一个编辑......）

...纯粹出于好玩，我想看看这套东西在完全泛型化时表现如何，结果是非常好......所以你可以编写如下代码，它与针对特定类型编写的版本一样高效（非硬件加速的情况除外，此时它会慢一些，但也不算太糟......）

    /// <summary>
    /// Generic version of the element-wise threshold check: returns <c>true</c> when every
    /// pair of elements at the same index in <paramref name="x1"/> and <paramref name="x2"/>
    /// is on the same side of <paramref name="threshold"/>.
    /// </summary>
    /// <typeparam name="T">Element type. Numeric primitives take the vectorized path;
    /// any other value type is handled entirely by the scalar comparer loop.</typeparam>
    /// <param name="x1">First sequence.</param>
    /// <param name="x2">Second sequence; must have the same length as <paramref name="x1"/>.</param>
    /// <param name="threshold">Value each element is compared against.</param>
    /// <returns><c>true</c> if all pairs agree (including for empty input); otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentException">The two spans have different lengths.</exception>
    /// <remarks>
    /// NOTE(review): the vector path uses <c>Vector.GreaterThan</c> while the scalar path uses
    /// <c>Comparer&lt;T&gt;.Default</c>; for floating-point NaN values these may not order
    /// elements identically - confirm if NaN inputs matter.
    /// </remarks>
    public static bool MatchElementwiseThreshold<T>(ReadOnlySpan<T> x1, ReadOnlySpan<T> x2, T threshold)
        where T : struct
    {
        if (x1.Length != x2.Length)
            throw new ArgumentException("x1.Length != x2.Length");

        // Vector<T> only supports numeric primitives; constructing it for any other T
        // throws NotSupportedException, so such types must skip straight to the scalar loop.
        if (Vector.IsHardwareAccelerated && IsVectorElementType<T>())
        {
            // MemoryMarshal.Cast is the shipped replacement for the preview-only
            // NonPortableCast extension; trailing elements that do not fill a whole
            // vector are left for the scalar loop.
            var vx1 = System.Runtime.InteropServices.MemoryMarshal.Cast<T, Vector<T>>(x1);
            var vx2 = System.Runtime.InteropServices.MemoryMarshal.Cast<T, Vector<T>>(x2);

            var vthreshold = new Vector<T>(threshold);
            for (int i = 0; i < vx1.Length; ++i)
            {
                var v1cmp = Vector.GreaterThan(vx1[i], vthreshold);
                var v2cmp = Vector.GreaterThan(vx2[i], vthreshold);

                // The masks are Vector<T>; for floating-point T an all-bits-set lane is a
                // NaN bit pattern, so they must be compared as raw bits (Xor viewed as
                // integers), not with T's own equality.
                if (Vector.AsVectorInt32(Vector.Xor(v1cmp, v2cmp)) != Vector<int>.Zero)
                    return false;
            }

            // Slice both spans down to the elements the vector loop did not cover.
            int consumed = Vector<T>.Count * vx1.Length;
            x1 = x1.Slice(consumed);
            x2 = x2.Slice(consumed);
        }

        var comparer = System.Collections.Generic.Comparer<T>.Default;
        for (int i = 0; i < x1.Length; i++)
        {
            bool above1 = comparer.Compare(x1[i], threshold) > 0;
            bool above2 = comparer.Compare(x2[i], threshold) > 0;
            if (above1 != above2)
                return false;
        }

        return true;
    }

    /// <summary>
    /// Returns <c>true</c> when <typeparamref name="T"/> is one of the numeric primitive
    /// types that <see cref="Vector{T}"/> accepts as an element type.
    /// </summary>
    private static bool IsVectorElementType<T>()
        where T : struct
    {
        return typeof(T) == typeof(byte) || typeof(T) == typeof(sbyte)
            || typeof(T) == typeof(short) || typeof(T) == typeof(ushort)
            || typeof(T) == typeof(int) || typeof(T) == typeof(uint)
            || typeof(T) == typeof(long) || typeof(T) == typeof(ulong)
            || typeof(T) == typeof(float) || typeof(T) == typeof(double);
    }

Answer 3

Vector只是一个向量类型，它并不声称或保证会使用SIMD扩展。请使用

System.Numerics.Vector2

https://docs.microsoft.com/en-us/dotnet/standard/numerics#simd-enabled-vector-types

使用Vector <t>运行比经典循环慢的SIMD矢量化C＃代码

3 个答案: