两段代码的速度差异本身并不重要,我只是想知道原因。一种方法很短,另一种在我看来优化得更好,但在测试中,较短、较差的那个方法反而更快。有人知道这是为什么吗?
优化后的方法有点类似于 .NET Framework 4.8 库中 Buffer 类的 Memmove 方法。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.Versioning;
using System.Runtime.ConstrainedExecution;
using System.Diagnostics;
namespace test
{
public static class TestClass
{
// Size of the target buffers and the largest copy length used by the benchmark loops.
const int MAX_CHARS = 101;
// Number of outer benchmark passes; each pass copies every length from 1 to MAX_CHARS.
static int _cyclesLength = 10000000;
// Shared source text for the copies; filled once by the static constructor.
static char[] _source;
// Type initializer: materializes the fixed sample text into the shared _source buffer.
static TestClass()
{
    string sampleText =
        "A computer is a machine that can be programmed to carry out sequences of arithmetic or logical operations automatically. Modern computers can perform generic "
        + "sets of operations known as programs. These programs enable computers to perform a wide range of tasks. A computer system is a complete computer that includes the "
        + "hardware, operating system (main software), and peripheral equipment needed and used for full operation. This term may also refer to a group of computers that are "
        + "linked and function together, such as a computer network or computer cluster. A broad range of industrial and consumer products use computers as control systems.Simple "
        + "special-purpose devices like microwave ovens and remote controls are included, as are factory devices like industrial robots and computer - aided design, as well as general "
        + "- purpose devices like personal computers and mobile devices like smartphones.Computers power the Internet, which links hundreds of millions of other computers and users.";

    // A string already knows how to produce its char[]; no LINQ enumeration needed.
    _source = sampleText.ToCharArray();
}
/// <summary>
/// Copies <paramref name="length"/> chars from <paramref name="source"/> to
/// <paramref name="target"/>, front to back, using 32-bit moves (two chars each).
/// </summary>
/// <param name="source">First char to read.</param>
/// <param name="target">First char to write.</param>
/// <param name="length">Number of chars to copy; zero or a negative value copies nothing.</param>
/// <remarks>
/// There is no overlap handling: the copy always runs forward. The int-sized accesses
/// are only char-aligned, so the platform must tolerate unaligned 32-bit loads/stores.
/// </remarks>
[System.Security.SecurityCritical]
[ResourceExposure(ResourceScope.None)]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
public static unsafe void Copy(char* source, char* target, int length)
{
    // Main loop: 8 chars per iteration as four int-sized moves.
    // The stores go to target; the earlier version had the two sides swapped and
    // overwrote the source buffer instead.
    while (length >= 8)
    {
        *(int*)target = *(int*)source;
        *(int*)(target + 2) = *(int*)(source + 2);
        *(int*)(target + 4) = *(int*)(source + 4);
        *(int*)(target + 6) = *(int*)(source + 6);
        source += 8;
        target += 8;
        length -= 8;
    }

    // Remaining pairs of chars (at most three iterations).
    while (length >= 2)
    {
        *(int*)target = *(int*)source;
        source += 2;
        target += 2;
        length -= 2;
    }

    // Odd trailing char.
    if (length > 0)
    {
        *target = *source;
    }
}
/// <summary>
/// Copies <paramref name="length"/> chars from <paramref name="source"/> to
/// <paramref name="target"/> using unrolled 32-bit moves (two chars each).
/// </summary>
/// <param name="source">First char to read.</param>
/// <param name="target">First char to write.</param>
/// <param name="length">Number of chars to copy; zero or a negative value copies nothing.</param>
/// <remarks>
/// Lengths below 8 are handled by a fully unrolled switch. For longer inputs the
/// ragged tail (length mod 8) is written first with int moves anchored at the end of
/// the range, then whole 8-char blocks are copied from the front, up to 48 chars per
/// loop iteration. Tail writes may overlap chars the block loop also writes; both
/// write the same values. No overlap handling between the two buffers is performed.
/// </remarks>
[System.Security.SecurityCritical]
[ResourceExposure(ResourceScope.None)]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
public static unsafe void OptimizedCopy(char* source, char* target, int length)
{
    if (length < 8)
    {
        switch (length)
        {
            case 1:
                *target = *source;
                return;
            case 2:
                *(int*)target = *(int*)source;
                return;
            case 3:
                *(int*)target = *(int*)source;
                *(target + 2) = *(source + 2);
                return;
            case 4:
                *(int*)target = *(int*)source;
                *(int*)(target + 2) = *(int*)(source + 2);
                return;
            case 5:
                *(int*)target = *(int*)source;
                *(int*)(target + 2) = *(int*)(source + 2);
                *(target + 4) = *(source + 4);
                return;
            case 6:
                *(int*)target = *(int*)source;
                *(int*)(target + 2) = *(int*)(source + 2);
                *(int*)(target + 4) = *(int*)(source + 4);
                return;
            case 7:
                *(int*)target = *(int*)source;
                *(int*)(target + 2) = *(int*)(source + 2);
                *(int*)(target + 4) = *(int*)(source + 4);
                *(target + 6) = *(source + 6);
                return;
            default:
                // Zero, or a negative length: nothing to copy. Without this a negative
                // value would fall through to the block copy below and write out of bounds.
                return;
        }
    }

    // length >= 8 from here on. Copy the last (length & 7) chars, rounded up to an even
    // count, using addresses relative to the end of the range. An odd remainder re-copies
    // one char that belongs to the last full block, which is harmless.
    switch (length & 7)
    {
        case 1:
        case 2:
            *(int*)(target + length - 2) = *(int*)(source + length - 2);
            break;
        case 3:
        case 4:
            *(int*)(target + length - 4) = *(int*)(source + length - 4);
            *(int*)(target + length - 2) = *(int*)(source + length - 2);
            break;
        case 5:
        case 6:
            *(int*)(target + length - 6) = *(int*)(source + length - 6);
            *(int*)(target + length - 4) = *(int*)(source + length - 4);
            *(int*)(target + length - 2) = *(int*)(source + length - 2);
            break;
        case 7:
            *(int*)(target + length - 8) = *(int*)(source + length - 8);
            *(int*)(target + length - 6) = *(int*)(source + length - 6);
            *(int*)(target + length - 4) = *(int*)(source + length - 4);
            *(int*)(target + length - 2) = *(int*)(source + length - 2);
            break;
    }

    // Whole 8-char blocks from the front. Each "length < k" test stops as soon as the
    // next block would not fit completely; the tail was already written above.
    while (true)
    {
        *(int*)target = *(int*)source;
        *(int*)(target + 2) = *(int*)(source + 2);
        *(int*)(target + 4) = *(int*)(source + 4);
        *(int*)(target + 6) = *(int*)(source + 6);
        if (length < 16) return;
        *(int*)(target + 8) = *(int*)(source + 8);
        *(int*)(target + 10) = *(int*)(source + 10);
        *(int*)(target + 12) = *(int*)(source + 12);
        *(int*)(target + 14) = *(int*)(source + 14);
        if (length < 24) return;
        *(int*)(target + 16) = *(int*)(source + 16);
        *(int*)(target + 18) = *(int*)(source + 18);
        *(int*)(target + 20) = *(int*)(source + 20);
        *(int*)(target + 22) = *(int*)(source + 22);
        if (length < 32) return;
        *(int*)(target + 24) = *(int*)(source + 24);
        *(int*)(target + 26) = *(int*)(source + 26);
        *(int*)(target + 28) = *(int*)(source + 28);
        *(int*)(target + 30) = *(int*)(source + 30);
        if (length < 40) return;
        *(int*)(target + 32) = *(int*)(source + 32);
        *(int*)(target + 34) = *(int*)(source + 34);
        *(int*)(target + 36) = *(int*)(source + 36);
        *(int*)(target + 38) = *(int*)(source + 38);
        if (length < 48) return;
        *(int*)(target + 40) = *(int*)(source + 40);
        *(int*)(target + 42) = *(int*)(source + 42);
        *(int*)(target + 44) = *(int*)(source + 44);
        *(int*)(target + 46) = *(int*)(source + 46);
        if (length < 56) return;
        // At least one more full block remains; 48 is a multiple of 8, so the
        // remainder handled by the tail switch is unaffected.
        source += 48;
        target += 48;
        length -= 48;
    }
}
// Benchmark driver for Copy: for each of _cyclesLength passes, calls Copy once for
// every length from 1 to MAX_CHARS, with the shared _source array and a fresh
// MAX_CHARS-sized array pinned as the two pointer arguments. Always returns 1.
private static unsafe long TestCopy()
{
    long passes = _cyclesLength;
    char[] from = _source;
    char[] to = new char[MAX_CHARS];

    fixed (char* source = from)
    fixed (char* target = to)
    {
        long pass = 0;
        while (pass < passes)
        {
            int count = 1;
            while (count <= MAX_CHARS)
            {
                Copy(source, target, count);
                count++;
            }
            pass++;
        }
    }

    return 1;
}
// Benchmark driver for OptimizedCopy: for each of _cyclesLength passes, calls
// OptimizedCopy once for every length from 1 to MAX_CHARS, with the shared _source
// array and a fresh MAX_CHARS-sized array pinned as the two pointer arguments.
// Always returns 1.
private static unsafe long TestOptimizedCopy()
{
    long passes = _cyclesLength;
    char[] from = _source;
    char[] to = new char[MAX_CHARS];

    fixed (char* source = from)
    fixed (char* target = to)
    {
        long pass = 0;
        while (pass < passes)
        {
            int count = 1;
            while (count <= MAX_CHARS)
            {
                OptimizedCopy(source, target, count);
                count++;
            }
            pass++;
        }
    }

    return 1;
}
/// <summary>
/// Runs both benchmarks back to back and prints the elapsed seconds of each to the
/// console, followed by the benchmark name.
/// </summary>
/// <param name="pocetCyklu">Not read by this method.</param>
public static unsafe void TestMethod(long pocetCyklu = 0)
{
    Stopwatch timer = new Stopwatch();
    RunTimed(timer, " TestCopy", TestCopy);
    RunTimed(timer, " TestOptimizedCopy", TestOptimizedCopy);
}

// Forces a collection, waits one second, then times a single benchmark call and
// writes "<seconds><label>" to the console.
private static void RunTimed(Stopwatch timer, string label, Func<long> benchmark)
{
    GC.Collect();
    System.Threading.Thread.Sleep(1000);
    timer.Restart();
    benchmark();
    timer.Stop();
    Console.WriteLine(timer.Elapsed.TotalSeconds.ToString() + label);
}
}
}
看起来出于某种原因,更差但更短的算法反而更快,但我不知道为什么。也许是因为 CLR 把 CIL 编译(JIT)成机器码需要一些时间?
编辑:下面是从 .NET Framework 4.8 的 Buffer 类(Buffer.cs)中大幅简化而来的方法。我删除了 HAS_CUSTOM_BLOCKS、BIT64 以及处理缓冲区重叠的代码……并将数据类型从 byte* 改为 char*。新添加的代码现在非常接近原始代码,但结果反而更差。上面第一个简单的方法是我以前自己写的,现在我在库里看到了专业的实现,就想换用它。但我先对两者做了比较,从那以后就一直想知道为什么新方法没有更快——毕竟我自己的实现对 source、target 和 length 这几个变量的写入次数更多。无论如何,库代码使用了一些技巧,例如从末尾开始复制。很多人都尽量避免使用关键字 goto…… :)
/// <summary>
/// Copies <paramref name="len"/> chars from <paramref name="src"/> to
/// <paramref name="dest"/> with 32-bit moves (two chars each).
/// </summary>
/// <param name="src">First char to read.</param>
/// <param name="dest">First char to write.</param>
/// <param name="len">Number of chars to copy; zero or a negative value copies nothing.</param>
/// <remarks>
/// Short lengths copy a fixed-size head from the start and a fixed-size tail anchored
/// at the end of the range, letting the two overlap instead of branching on the exact
/// length. Lengths above 32 first copy whole 32-char blocks in a loop. Overlapping
/// source and destination buffers are not handled.
/// </remarks>
[System.Security.SecurityCritical]
[ResourceExposure(ResourceScope.None)]
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
internal unsafe static void Memmove(char* src, char* dest, int len)
{
    // Nothing to copy. This also keeps a negative length away from the bit tests
    // below, where it would select a branch that writes outside the buffers.
    if (len <= 0) return;

    // One-past-the-end pointers for the tail copies. They keep pointing at the real
    // end of the range even after src/dest are advanced by the block loop.
    char* srcEnd = src + len;
    char* destEnd = dest + len;
    if (len <= 8) goto MCPY02;
    if (len > 32) goto MCPY05;
MCPY00:
    // 9..32 chars remaining: copy leading 8-char groups, leave the rest to MCPY01.
    *(int*)dest = *(int*)src;
    *(int*)(dest + 2) = *(int*)(src + 2);
    *(int*)(dest + 4) = *(int*)(src + 4);
    *(int*)(dest + 6) = *(int*)(src + 6); // chars [0,8)
    if (len <= 16) goto MCPY01;
    *(int*)(dest + 8) = *(int*)(src + 8);
    *(int*)(dest + 10) = *(int*)(src + 10);
    *(int*)(dest + 12) = *(int*)(src + 12);
    *(int*)(dest + 14) = *(int*)(src + 14); // chars [0,16)
    if (len <= 24) goto MCPY01;
    *(int*)(dest + 16) = *(int*)(src + 16);
    *(int*)(dest + 18) = *(int*)(src + 18);
    *(int*)(dest + 20) = *(int*)(src + 20);
    *(int*)(dest + 22) = *(int*)(src + 22); // chars [0,24)
MCPY01:
    // Unconditionally copy the last 8 chars of the range.
    *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
    *(int*)(destEnd - 6) = *(int*)(srcEnd - 6);
    *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;
MCPY02:
    // len is 1..8. Bits 2 and 3 set means 4..8 chars: first 4 plus last 4.
    if ((len & 12) == 0) goto MCPY03;
    *(int*)dest = *(int*)src;
    *(int*)(dest + 2) = *(int*)(src + 2);
    *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;
MCPY03:
    // len is 1..3. Bit 1 set means 2 or 3 chars: first 2 plus last 2.
    if ((len & 2) == 0) goto MCPY04;
    *(int*)dest = *(int*)src;
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;
MCPY04:
    // Only len == 1 reaches this point (zero was rejected on entry).
    *dest = *src;
    return;
MCPY05:
    // len > 32: copy whole 32-char blocks. The reference implementation's P/Invoke
    // path for very large copies is intentionally left out.
    int n = len >> 5;
MCPY06:
    *(int*)dest = *(int*)src;
    *(int*)(dest + 2) = *(int*)(src + 2);
    *(int*)(dest + 4) = *(int*)(src + 4);
    *(int*)(dest + 6) = *(int*)(src + 6);
    *(int*)(dest + 8) = *(int*)(src + 8);
    *(int*)(dest + 10) = *(int*)(src + 10);
    *(int*)(dest + 12) = *(int*)(src + 12);
    *(int*)(dest + 14) = *(int*)(src + 14);
    *(int*)(dest + 16) = *(int*)(src + 16);
    // Was "src + 19": that read one char too far and corrupted dest[18..19].
    *(int*)(dest + 18) = *(int*)(src + 18);
    *(int*)(dest + 20) = *(int*)(src + 20);
    *(int*)(dest + 22) = *(int*)(src + 22);
    *(int*)(dest + 24) = *(int*)(src + 24);
    *(int*)(dest + 26) = *(int*)(src + 26);
    *(int*)(dest + 28) = *(int*)(src + 28);
    *(int*)(dest + 30) = *(int*)(src + 30);
    dest += 32;
    src += 32;
    n--;
    if (n != 0) goto MCPY06;
    len %= 32;
    // More than 8 chars left: reuse the 9..32 path on the advanced pointers.
    if (len > 8) goto MCPY00;
    // 0..8 chars left: one 8-char copy anchored at the end covers them.
    *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
    *(int*)(destEnd - 6) = *(int*)(srcEnd - 6);
    *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;
}
为了复制大量字节,该类还有一个非常快的(至少与托管代码相比)extern C 方法 __Memmove(byte* dest, byte* src, nuint len),用于从托管内存复制到托管内存。我不清楚 Benchmark 是如何工作的,但它的结果非常不稳定:有一次,所有方法几乎相同(差异约 20%),而另一次则相差很大(大约 100%)。在后一种情况下,被很多人说成非常慢的 Array.Copy 方法反而是最快的;而按 Stopwatch 的测量,它最多要慢 10 倍。