在使用Math.Round将double转换为int时,我注意到当目标平台为x64而不是x86时,性能下降非常显著(约15倍)。我是在Core i7 3770K、64位Windows上测试的。有人能复现这个问题吗?这种差异有什么合理的原因吗?也许与某些特殊的边界条件有关?
作为参考,我将Math.Round(Test1)与两种近似方法进行了比较:带条件判断的强制转换(Test2)和6755399441055744魔数技巧(Test3)。
运行时间(单位:秒)如下:
---------------------------
| | x86 | x64 |
|-------+--------+--------|
| Test1 | 0,0662 | 0,9975 |
| Test2 | 0,1517 | 0,1513 |
| Test3 | 0,1966 | 0,0978 |
---------------------------
以下是基准代码:
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
namespace MathRoundTester
{
    /// <summary>
    /// Console micro-benchmark that times three ways of converting a double to a
    /// rounded int: <c>Math.Round</c> (Test1), a conditional +/-0.5 truncating cast
    /// (Test2) and the 6755399441055744 magic-constant trick (Test3).
    /// </summary>
    class Program
    {
        /// <summary>Number of passes each test makes over the input array.</summary>
        private const int IterationCount = 1000000;

        // Every test result is XOR-ed into this field and it is printed at the end,
        // presumably so the measured work has an observable result.
        private static int dummy;

        /// <summary>
        /// Overlays an <c>int</c> on the first four bytes of a <c>double</c> so the
        /// raw bits of the double can be read back as an integer.
        /// </summary>
        [StructLayout(LayoutKind.Explicit)]
        private struct DoubleIntUnion
        {
            [FieldOffset(0)]
            public double Double;

            [FieldOffset(0)]
            public int Int;

            public DoubleIntUnion(double a)
            {
                // Both fields share storage; the write to Double is the one that sticks.
                Int = 0;
                Double = a;
            }
        }

        /// <summary>
        /// Fills 100 pseudo-random doubles (fixed seed 0), runs every test once
        /// untimed, then prints the elapsed seconds of each test followed by the
        /// accumulated checksum, and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            var rand = new Random(0);
            var data = new double[100];
            for (int index = 0; index < data.Length; ++index)
            {
                // Spreads the values over roughly the whole int range, plus a random fraction.
                data[index] = rand.NextDouble() * int.MaxValue * 2 +
                              int.MinValue + rand.NextDouble();
            }

            // Untimed first run of each test (presumably a JIT warm-up).
            dummy ^= Test1(data);
            dummy ^= Test2(data);
            dummy ^= Test3(data);

            var timedTests = new Func<double[], int>[] { Test1, Test2, Test3 };
            foreach (var test in timedTests)
            {
                RecordTime(data, test);
            }

            Console.WriteLine(dummy);
            Console.Read();
        }

        /// <summary>
        /// Forces a full garbage collection, runs <paramref name="action"/> once over
        /// <paramref name="data"/> and prints the elapsed time in seconds ("F4").
        /// </summary>
        private static void RecordTime(double[] data, Func<double[], int> action)
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();

            var stopwatch = Stopwatch.StartNew();
            dummy ^= action(data);
            stopwatch.Stop();

            double seconds = stopwatch.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(seconds.ToString("F4"));
        }

        /// <summary>Baseline: <c>(int)Math.Round(x)</c> for every element, XOR-folded.</summary>
        private static int Test1(double[] data)
        {
            int checksum = 0;
            for (int pass = 0; pass < IterationCount; ++pass)
            {
                for (int index = 0; index < data.Length; ++index)
                {
                    var value = data[index];
                    checksum ^= (int)Math.Round(value);
                }
            }

            return checksum;
        }

        /// <summary>
        /// Approximation: move the value 0.5 away from zero, then let the int cast
        /// truncate. Midpoints therefore round away from zero.
        /// </summary>
        private static int Test2(double[] data)
        {
            int checksum = 0;
            for (int pass = 0; pass < IterationCount; ++pass)
            {
                for (int index = 0; index < data.Length; ++index)
                {
                    var value = data[index];
                    checksum ^= value > 0 ? (int)(value + 0.5) : (int)(value - 0.5);
                }
            }

            return checksum;
        }

        /// <summary>
        /// Approximation: add 6755399441055744.0 (2^52 + 2^51) and read the bytes at
        /// offset 0 of the sum as an int. Assumes a little-endian layout, so those
        /// bytes are the low mantissa bits holding the rounded integer -- TODO confirm
        /// on the target platform.
        /// </summary>
        private static int Test3(double[] data)
        {
            int checksum = 0;
            for (int pass = 0; pass < IterationCount; ++pass)
            {
                for (int index = 0; index < data.Length; ++index)
                {
                    var value = data[index];
                    checksum ^= new DoubleIntUnion(value + 6755399441055744.0).Int;
                }
            }

            return checksum;
        }
    }
}
2016-11-23更新:
AndreyAkinshin在dotnet/coreclr仓库中提交了一个issue,一段时间后该问题被加入了1.2.0里程碑。看来这个问题只是一个疏漏,将会得到修复。
答案 0 :(得分:9)
让我们看一下(int) Math.Round(data[j])对应的汇编代码。
LegacyJIT-x86:
01172EB0 fld qword ptr [eax+edi*8+8]
01172EB4 fistp dword ptr [ebp-14h]
RyuJIT-64:
`d7350617 c4e17b1044d010 vmovsd xmm0,qword ptr [rax+rdx*8+10h]
`d735061e e83dce605f call clr!COMDouble::Round (`3695d460)
`d7350623 c4e17b2ce8 vcvttsd2si ebp,xmm0
clr!COMDouble::Round的汇编代码:
clr!COMDouble::Round:
`3695d460 4883ec58 sub rsp,58h
`3695d464 0f29742440 movaps xmmword ptr [rsp+40h],xmm6
`3695d469 0f57c9 xorps xmm1,xmm1
`3695d46c f2480f2cc0 cvttsd2si rax,xmm0
`3695d471 0f297c2430 movaps xmmword ptr [rsp+30h],xmm7
`3695d476 0f28f0 movaps xmm6,xmm0
`3695d479 440f29442420 movaps xmmword ptr [rsp+20h],xmm8
`3695d47f f2480f2ac8 cvtsi2sd xmm1,rax
`3695d484 660f2ec1 ucomisd xmm0,xmm1
`3695d488 7a17 jp clr!COMDouble::Round+0x41 (`3695d4a1)
`3695d48a 7515 jne clr!COMDouble::Round+0x41 (`3695d4a1)
`3695d48c 0f28742440 movaps xmm6,xmmword ptr [rsp+40h]
`3695d491 0f287c2430 movaps xmm7,xmmword ptr [rsp+30h]
`3695d496 440f28442420 movaps xmm8,xmmword ptr [rsp+20h]
`3695d49c 4883c458 add rsp,58h
`3695d4a0 c3 ret
`3695d4a1 440f28c0 movaps xmm8,xmm0
`3695d4a5 f2440f5805c23a7100 addsd xmm8,mmword ptr [clr!_real (`37070f70)] ds:`37070f70=3fe0000000000000
`3695d4ae 410f28c0 movaps xmm0,xmm8
`3695d4b2 e821000000 call clr!floor (`3695d4d8)
`3695d4b7 66410f2ec0 ucomisd xmm0,xmm8
`3695d4bc 0f28f8 movaps xmm7,xmm0
`3695d4bf 7a06 jp clr!COMDouble::Round+0x67 (`3695d4c7)
`3695d4c1 0f8465af3c00 je clr! ?? ::FNODOBFM::`string'+0xdd8c4 (`36d2842c)
`3695d4c7 0f28ce movaps xmm1,xmm6
`3695d4ca 0f28c7 movaps xmm0,xmm7
`3695d4cd ff1505067000 call qword ptr [clr!_imp__copysign (`3705dad8)]
`3695d4d3 ebb7 jmp clr!COMDouble::Round+0x2c (`3695d48c)
如您所见,LegacyJIT-x86使用了速度极快的fld-fistp指令对;根据Agner Fog的指令表(Instruction tables),Haswell上的数据如下:
Instruction | Latency | Reciprocal throughput
------------|---------|----------------------
FLD m32/64 | 3 | 0.5
FIST(P) m | 7 | 1
RyuJIT-x64直接调用clr!COMDouble::Round(LegacyJIT-x64也是如此)。您可以在dotnet/coreclr仓库中找到此方法的源代码。如果您使用的是release-1.0.0分支,则需要查看floatnative.cpp:
// COMDouble::Round: rounds a double to an integral value, returned as a double.
// _TARGET_X86_ selects between a hand-written x87 stub and a portable C++ fallback.
#if defined(_TARGET_X86_)
// x86 build: a naked function, so the body below is the entire routine and has to
// clean up the stack itself. It does three things:
//   fld     - loads the 8-byte argument d from [ESP+4] onto the x87 register stack
//   frndint - rounds ST(0) to an integer using the current x87 rounding mode
//             (presumably round-to-nearest-even, the x87 default -- TODO confirm the
//             runtime never changes the control word)
//   ret 8   - returns with the result left in ST(0) and pops the 8 argument bytes
__declspec(naked)
double __fastcall COMDouble::Round(double d)
{
LIMITED_METHOD_CONTRACT;
__asm {
fld QWORD PTR [ESP+4]
frndint
ret 8
}
}
#else // !defined(_TARGET_X86_)
// Non-x86 build: round half to even, implemented with floor(), fmod() and _copysign().
FCIMPL1_V(double, COMDouble::Round, double d)
FCALL_CONTRACT;
double tempVal;
double flrTempVal;
// If the number has no fractional part, return it unchanged.
// This shortcut is necessary to work around precision loss in borderline cases on some platforms.
if ( d == (double)(__int64)d )
return d;
// Shift by one half so that floor() below rounds to the nearest integer.
tempVal = (d+0.5);
// If tempVal is itself integral, d was equally close to two integers.
// We need to return the even one, so step back by one when tempVal is odd.
flrTempVal = floor(tempVal);
if (flrTempVal==tempVal) {
if (0 != fmod(tempVal, 2.0)) {
flrTempVal -= 1.0;
}
}
// Give the result the sign of the input.
flrTempVal = _copysign(flrTempVal, d);
return flrTempVal;
FCIMPLEND
#endif // defined(_TARGET_X86_)
如果您使用的是master分支,则可以在floatdouble.cpp中找到类似的代码:
// COMDouble::Round: rounds x to the nearest integral value (returned as a double);
// a value exactly halfway between two integers goes to the even one.
// There is no x86-specific variant in this excerpt.
FCIMPL1_V(double, COMDouble::Round, double x)
FCALL_CONTRACT;
// If the number has no fractional part, return it unchanged.
// This shortcut is necessary to work around precision loss in borderline cases on some platforms.
if (x == (double)((INT64)x)) {
return x;
}
// Shift by one half so that floor() rounds to the nearest integer. If the shifted
// value is itself integral, x was equally close to two integers and we need to
// return the even one: step back by one when the shifted value is odd.
double tempVal = (x + 0.5);
double flrTempVal = floor(tempVal);
if ((flrTempVal == tempVal) && (fmod(tempVal, 2.0) != 0)) {
flrTempVal -= 1.0;
}
// Give the result the sign of the input.
return _copysign(flrTempVal, x);
FCIMPLEND
似乎完整的.NET Framework使用相同的逻辑。
因此,由于不同JIT编译器的内部实现存在差异,(int)Math.Round在x86上确实比在x64上快得多。请注意,这一行为将来可能会改变。
顺便说一句,你可以在BenchmarkDotNet的帮助下编写一个小而可靠的基准:
/// <summary>
/// BenchmarkDotNet benchmark measuring <c>(int)Math.Round(double)</c>. The job
/// attributes request runs under LegacyJIT-x86, LegacyJIT-x64 and RyuJIT-x64.
/// </summary>
[LegacyJitX86Job, LegacyJitX64Job, RyuJitX64Job]
public class MathRoundBenchmarks
{
    /// <summary>Number of input values, and of rounding operations per invocation.</summary>
    private const int N = 100;

    private double[] data;

    /// <summary>
    /// Builds the input array from a fixed-seed generator so every run rounds the
    /// same values.
    /// </summary>
    [Setup]
    public void Setup()
    {
        var rand = new Random(0);
        var values = new double[N];
        for (int index = 0; index < values.Length; ++index)
        {
            // Spreads the values over roughly the whole int range, plus a random fraction.
            values[index] = rand.NextDouble() * int.MaxValue * 2 +
                            int.MinValue + rand.NextDouble();
        }

        data = values;
    }

    /// <summary>
    /// Rounds every input value and XOR-folds the results into the return value.
    /// OperationsPerInvoke = N is presumably there so the reported time is per
    /// rounding operation, not per call -- confirm against the BenchmarkDotNet docs.
    /// </summary>
    [Benchmark(OperationsPerInvoke = N)]
    public int MathRound()
    {
        int checksum = 0;
        for (int index = 0; index < data.Length; ++index)
        {
            checksum ^= (int) Math.Round(data[index]);
        }

        return checksum;
    }
}
结果:
BenchmarkDotNet.Core=v0.9.9.0
OS=Microsoft Windows NT 6.2.9200.0
Processor=Intel(R) Core(TM) i7-4702MQ CPU 2.20GHz, ProcessorCount=8
Frequency=2143475 ticks, Resolution=466.5321 ns, Timer=TSC
CLR=MS.NET 4.0.30319.42000, Arch=64-bit RELEASE [RyuJIT]
GC=Concurrent Workstation
JitModules=clrjit-v4.6.1586.0
Type=MathRoundBenchmarks Mode=Throughput
Method | Platform | Jit | Median | StdDev |
---------- |--------- |---------- |----------- |---------- |
MathRound | X64 | LegacyJit | 12.8640 ns | 0.2796 ns |
MathRound | X64 | RyuJit | 13.4390 ns | 0.4365 ns |
MathRound | X86 | LegacyJit | 1.0278 ns | 0.0373 ns |
答案 1 :(得分:0)
这算不上一个真正的答案,但下面的代码可能对需要优化x64系统上性能关键代码的人有用,是否适用取决于您对舍入精度的具体要求。
执行100000000次操作的耗时(单位:毫秒)如下:
Round(x): 1112
Round(x,y): 2183
FastMath.Round(x): 155
FastMath.Round(x,y): 519
代码:
/// <summary>
/// Cheaper approximations of <c>Math.Round</c> built on <c>Math.Floor(x + 0.5)</c>.
/// Midpoints are always rounded up (towards positive infinity), so results differ
/// from round-half-to-even for values such as 2.5 or -2.5.
/// </summary>
public static class FastMath
{
    // RoundLookup[i] holds 10^i for i = 0..14.
    private static readonly double[] RoundLookup = CreateRoundLookup();

    /// <summary>Builds the table of the first 15 powers of ten.</summary>
    private static double[] CreateRoundLookup()
    {
        var powers = new double[15];
        int exponent = 0;
        while (exponent < powers.Length)
        {
            powers[exponent] = Math.Pow(10, exponent);
            exponent++;
        }

        return powers;
    }

    /// <summary>Rounds <paramref name="value"/> to the nearest integer, midpoints up.</summary>
    /// <param name="value">The number to round.</param>
    /// <returns>The rounded value as a double.</returns>
    public static double Round(double value)
    {
        double shifted = value + 0.5;
        return Math.Floor(shifted);
    }

    /// <summary>
    /// Rounds <paramref name="value"/> to <paramref name="decimalPlaces"/> fractional
    /// digits, midpoints up.
    /// </summary>
    /// <param name="value">The number to round.</param>
    /// <param name="decimalPlaces">
    /// Number of fractional digits to keep; must be in 0..14 because it indexes the
    /// power-of-ten table directly.
    /// </param>
    /// <returns>The rounded value.</returns>
    /// <exception cref="IndexOutOfRangeException">
    /// <paramref name="decimalPlaces"/> is negative or greater than 14.
    /// </exception>
    public static double Round(double value, int decimalPlaces)
    {
        double scale = RoundLookup[decimalPlaces];
        double shifted = value * scale + 0.5;
        return Math.Floor(shifted) / scale;
    }
}