在int中操作单个字节的最快方法

时间:2015-05-02 22:30:40

标签: c# performance unsafe

我发现我的应用程序花了25%的时间在循环中执行此操作:

/// <summary>
/// Computes the sum of squared differences between the corresponding bytes of two
/// 32-bit integers.
/// </summary>
/// <param name="c0">First packed value; each of its four bytes is read as 0..255.</param>
/// <param name="c1">Second packed value, compared byte-for-byte against <paramref name="c0"/>.</param>
/// <returns>The four squared per-byte differences added together.</returns>
private static unsafe int Diff (int c0, int c1)
{
    // Reinterpret the local copies of the arguments as raw byte sequences.
    byte* lhs = (byte*) &c0;
    byte* rhs = (byte*) &c1;

    // byte - byte is evaluated as int, so the differences may be negative.
    int delta0 = lhs[0] - rhs[0];
    int delta1 = lhs[1] - rhs[1];
    int delta2 = lhs[2] - rhs[2];
    int delta3 = lhs[3] - rhs[3];

    return delta0 * delta0
         + delta1 * delta1
         + delta2 * delta2
         + delta3 * delta3;
}

如何改善此方法的性能?到目前为止我的想法:

  1. 最明显的是,这会受益于SIMD,但让我们假设我不想去那里,因为这有点麻烦。
  2. 同样适用于较低级别的东西(调用C库,在GPGPU上执行)
  3. 多线程 - 我会使用它。
编辑:为方便起见,下面是一些反映真实环境和用例的测试代码。(实际上涉及的数据更多,而且数据不是作为单个大块进行比较,而是分成许多大小为几KB的块分别比较。)

    /// <summary>
    /// Console micro-benchmark that times two implementations of the per-byte
    /// squared-difference sum over the same pseudo-random input.
    /// </summary>
    public static class ByteCompare
    {
        /// <summary>
        /// Builds two int arrays from a fixed-seed <see cref="Random"/>, runs each
        /// implementation over every pair of elements <c>repeat</c> times, and prints the
        /// accumulated result and the elapsed seconds for each. Waits for a key press
        /// before returning.
        /// </summary>
        private static void Main ()
        {
            const int n = 1024 * 1024 * 20;
            const int repeat = 20;
            // Fixed seed: both timed sections below operate on identical data.
            var rnd = new Random (0);

            Console.Write ("Generating test data... ");
            var t0 = Enumerable.Range (1, n)
                .Select (x => rnd.Next (int.MinValue, int.MaxValue))
                .ToArray ();
            var t1 = Enumerable.Range (1, n)
                .Select (x => rnd.Next (int.MinValue, int.MaxValue))
                .ToArray ();
            Console.WriteLine ("complete.");
            // Force a full collection before the timed sections start.
            GC.Collect (2, GCCollectionMode.Forced);
            Console.WriteLine ("GCs: " + GC.CollectionCount (0));

            // Each candidate is timed in its own scope with a direct call inside the loop.
            {
                var sw = Stopwatch.StartNew ();
                long res = 0;
                for (int reps = 0; reps < repeat; reps++) {
                    for (int i = 0; i < n; i++) {
                        int c0 = t0[i];
                        int c1 = t1[i];
                        res += ByteDiff_REGULAR (c0, c1);
                    }
                }
                sw.Stop ();
                Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_REGULAR");
            }
            {
                var sw = Stopwatch.StartNew ();
                long res = 0;
                for (int reps = 0; reps < repeat; reps++) {
                    for (int i = 0; i < n; i++) {
                        int c0 = t0[i];
                        int c1 = t1[i];
                        res += ByteDiff_UNSAFE (c0, c1);
                    }
                }
                sw.Stop ();
                // The label matches the name of the method that was actually measured.
                Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_UNSAFE");
            }

            // Printed again so it can be compared with the count shown before the timed sections.
            Console.WriteLine ("GCs: " + GC.CollectionCount (0));
            Console.WriteLine ("Test complete.");
            Console.ReadKey (true);
        }

        /// <summary>
        /// Sums the squared differences of the four corresponding bytes of two ints,
        /// extracting each byte with a shift and a narrowing cast.
        /// </summary>
        /// <param name="c0">First packed value.</param>
        /// <param name="c1">Second packed value.</param>
        /// <returns>Sum of the four squared per-byte differences.</returns>
        public static int ByteDiff_REGULAR (int c0, int c1)
        {
            // The casts intentionally truncate to the low 8 bits; "unchecked" keeps that
            // behaviour even when the project is compiled with overflow checking enabled.
            unchecked {
                var c00 = (byte) (c0 >> (8 * 0));
                var c01 = (byte) (c0 >> (8 * 1));
                var c02 = (byte) (c0 >> (8 * 2));
                var c03 = (byte) (c0 >> (8 * 3));
                var c10 = (byte) (c1 >> (8 * 0));
                var c11 = (byte) (c1 >> (8 * 1));
                var c12 = (byte) (c1 >> (8 * 2));
                var c13 = (byte) (c1 >> (8 * 3));
                // byte - byte is computed as int, so negative differences are preserved.
                var d0 = (c00 - c10);
                var d1 = (c01 - c11);
                var d2 = (c02 - c12);
                var d3 = (c03 - c13);
                d0 *= d0;
                d1 *= d1;
                d2 *= d2;
                d3 *= d3;
                return d0 + d1 + d2 + d3;
            }
        }

        /// <summary>
        /// Sums the squared differences of the four corresponding bytes of two ints,
        /// reading the bytes through pointers to the arguments.
        /// </summary>
        /// <param name="c0">First packed value.</param>
        /// <param name="c1">Second packed value.</param>
        /// <returns>Sum of the four squared per-byte differences.</returns>
        private static int ByteDiff_UNSAFE (int c0, int c1)
        {
            unsafe {
                // View the local copies of the arguments as raw bytes.
                byte* pc0 = (byte*) &c0;
                byte* pc1 = (byte*) &c1;
                int d0 = pc0[0] - pc1[0];
                int d1 = pc0[1] - pc1[1];
                int d2 = pc0[2] - pc1[2];
                int d3 = pc0[3] - pc1[3];
                d0 *= d0;
                d1 *= d1;
                d2 *= d2;
                d3 *= d3;
                return d0 + d1 + d2 + d3;
            }
        }
    }
    

    对我来说(在i5上作为x64版本运行):

    Generating test data... complete.
    GCs: 8
    res=18324555528140, t=1.46s - ByteDiff_REGULAR
    res=18324555528140, t=1.15s - ByteDiff_UNSAFE
    res=18324555528140, t=1.73s - Diff_Alex1
    res=18324555528140, t=1.63s - Diff_Alex2
    res=18324555528140, t=3.59s - Diff_Alex3
    res=18325828513740, t=3.90s - Diff_Alex4
    GCs: 8
    Test complete.
    

3 个答案:

答案 0 :(得分:4)

  

最明显的是,这会受益于SIMD,但让我们假设我不想去那里,因为这有点麻烦。

如果你愿意,可以避免使用它,但实际上它可以直接从C#中得到很好的支持。如果更大的算法适用于SIMD处理,我认为这是迄今为止最大的性能赢家。

http://www.drdobbs.com/architecture-and-design/simd-enabled-vector-types-with-c/240168888

  

多线程

当然,可以为每个CPU核心使用一个线程。您也可以使用Parallel.For之类的构造,让.NET自行决定要使用的线程数。它在这方面做得相当好,但既然您知道这项工作肯定是CPU密集型(CPU-bound)的,自己管理线程也许(也可能不会)得到更好的结果。

至于加速这段代码本身,使用位掩码和移位来提取各个字节可能比使用指针更快。这样做的另一个好处是不需要unsafe代码块,例如:

// Shift the top byte down first, then mask to 0..255. Masking an int with the uint
// literal 0xff000000 would promote the expression to long, which does not convert to
// byte implicitly.
byte b0_leftmost = (byte) ((c0 >> 24) & 0xFF);

答案 1 :(得分:1)

除了前面已经提到的SIMD和并行运行多个操作这两种方案之外,您是否尝试过对这个方法的几种可能的实现变体进行基准测试?例如下面这些写法。

我差点忘了提一个非常重要的优化:

  • 添加using System.Runtime.CompilerServices;
  • 将[MethodImpl(MethodImplOptions.AggressiveInlining)]属性添加到您的方法中。

像这样:

/// <summary>
/// Sums the squared differences of the corresponding bytes of two ints by walking
/// both values one byte at a time.
/// </summary>
/// <param name="c0">First packed value.</param>
/// <param name="c1">Second packed value.</param>
/// <returns>Sum of the four squared per-byte differences.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe int Diff(int c0, int c1)
{
    // Treat the local copies of the arguments as byte sequences.
    byte* left = (byte*)&c0;
    byte* right = (byte*)&c1;

    int total = 0;
    for (int offset = 0; offset < sizeof(int); offset++)
    {
        // byte - byte is evaluated as int, so the sign of the difference is kept.
        int delta = left[offset] - right[offset];
        total += delta * delta;
    }

    return total;
}

/// <summary>
/// Sums the squared differences of the corresponding bytes of two ints using only
/// shifts and masks (no unsafe code).
/// </summary>
/// <param name="c0">First packed value.</param>
/// <param name="c1">Second packed value.</param>
/// <returns>Sum of the four squared per-byte differences.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unchecked
    {
        int total = 0;

        // Visit the bytes from least to most significant. The 0xFF mask discards any
        // sign bits that the arithmetic right shift drags in.
        for (int shift = 0; shift < 32; shift += 8)
        {
            int delta = ((c0 >> shift) & 0xFF) - ((c1 >> shift) & 0xFF);
            total += delta * delta;
        }

        return total;
    }
}

/// <summary>
/// Sums the squared differences of the corresponding bytes of two ints, staging the
/// four differences in a stack-allocated scratch buffer before squaring them.
/// </summary>
/// <param name="c0">First packed value.</param>
/// <param name="c1">Second packed value.</param>
/// <returns>Sum of the four squared per-byte differences.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe int Diff(int c0, int c1)
{
    int* deltas = stackalloc int[4];
    byte* left = (byte*)&c0;
    byte* right = (byte*)&c1;

    // First pass: store the signed per-byte differences.
    for (int i = 0; i < 4; i++)
    {
        deltas[i] = left[i] - right[i];
    }

    // Second pass: accumulate the squares.
    int total = 0;
    for (int i = 0; i < 4; i++)
    {
        total += deltas[i] * deltas[i];
    }

    return total;
}

/// <summary>
/// Sums the squared differences of the corresponding bytes of two ints, extracting the
/// bytes with shifts and masks and staging the differences in a stack-allocated buffer.
/// </summary>
/// <param name="c0">First packed value.</param>
/// <param name="c1">Second packed value.</param>
/// <returns>Sum of the four squared per-byte differences.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Diff(int c0, int c1)
{
    unsafe
    {
        int* difs = stackalloc int[4];
        // ">>" on a signed int is an arithmetic shift, so the most significant byte must
        // be masked as well; without the mask, inputs with the sign bit set would yield
        // -128..-1 here instead of 128..255 and produce a wrong result.
        difs[0] = ((c0 >> 24) & 0xFF) - ((c1 >> 24) & 0xFF);
        difs[1] = ((c0 >> 16) & 0xFF) - ((c1 >> 16) & 0xFF);
        difs[2] = ((c0 >> 8) & 0xFF) - ((c1 >> 8) & 0xFF);
        difs[3] = (c0 & 0xFF) - (c1 & 0xFF);
        return difs[0] * difs[0] + difs[1] * difs[1] + difs[2] * difs[2] + difs[3] * difs[3];
    }
}

答案 2 :(得分:1)

我尝试减少IL指令的数量(对于单线程、不使用SIMD的代码,这似乎是唯一的选择)。在我的机器上,这段代码比问题中的版本快35%。另外,我想您也可以尝试通过Emit(System.Reflection.Emit)自己生成IL指令,这样可以更精确地控制生成的代码。

/// <summary>
/// Sums the squared differences of the corresponding bytes of two ints, folding each
/// squared difference into a running total as soon as it is computed.
/// </summary>
/// <param name="c0">First packed value.</param>
/// <param name="c1">Second packed value.</param>
/// <returns>Sum of the four squared per-byte differences.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe int ByteDiff_UNSAFE_2 (int c0, int c1)
{
    // View the local copies of the arguments as raw bytes.
    byte* left = (byte*) &c0;
    byte* right = (byte*) &c1;

    int delta = left[0] - right[0];
    int total = delta * delta;

    delta = left[1] - right[1];
    total += delta * delta;

    delta = left[2] - right[2];
    total += delta * delta;

    delta = left[3] - right[3];
    return total + delta * delta;
}