为什么第一段代码比第二段要慢得多?
/// <summary>
/// Draws non-negative 63-bit samples from <c>Gen()</c> until one is not greater
/// than <c>long.MaxValue - mod</c>, then reduces that sample modulo
/// <paramref name="mod"/>.
/// </summary>
/// <param name="mod">Divisor used for both the rejection threshold and the final remainder.</param>
/// <returns>The accepted sample modulo <paramref name="mod"/>.</returns>
public long Gen(int mod)
{
    long limit = long.MaxValue - mod;
    long sample;

    do
    {
        // Dropping the lowest bit leaves the top bit clear, so the cast to long is never negative.
        sample = (long)(Gen() >> 1);
    }
    while (sample > limit);

    // The divisor is a run-time value here; this is the variant the question reports as slow.
    return sample % mod;
}
/// <summary>
/// Draws non-negative 63-bit samples from <c>Gen()</c> until one is not greater
/// than <c>long.MaxValue - mod</c>, then reduces that sample modulo the literal 12345.
/// </summary>
/// <param name="mod">Only used to compute the rejection threshold; the remainder uses a fixed divisor.</param>
/// <returns>The accepted sample modulo 12345.</returns>
public long Gen(int mod)
{
    long limit = long.MaxValue - mod;
    long sample;

    do
    {
        // Dropping the lowest bit leaves the top bit clear, so the cast to long is never negative.
        sample = (long)(Gen() >> 1);
    }
    while (sample > limit);

    // The divisor is a compile-time constant here; this is the variant the question reports as fast.
    return sample % 12345;
}
Gen() 函数是一个 64 位无符号 PRNG(见下文)。
问题在于,第一段代码的运行速度非常慢:仅仅因为取模时的除数是一个变量,生成随机数所需的时间基本上就增加了3倍!更令人困惑的是,如果去掉循环、仍然用变量作为除数取模,速度又与第二段代码差不多。
这里一定发生了一些奇怪的事情,因为用变量取模的开销不可能是下面这个函数的好几倍:
/// <summary>
/// Returns the next 64-bit output word. Words are handed out from <c>block</c>
/// one at a time; every fourth call (when the cursor wraps to 0) increments
/// <c>state[0]</c>, mixes the four state words for two rounds and stores the
/// result in <c>block</c> before returning its first word.
/// </summary>
/// <returns>The word of <c>block</c> selected by the updated cursor.</returns>
public ulong Gen()
{
    // Advance the cursor through 0..3; only a wrap to 0 triggers a refill.
    counter = (counter + 1) & 3;
    if (counter != 0)
    {
        return block[counter];
    }

    // state[0] is bumped before mixing, so each refill starts from a different input.
    ulong w0 = ++state[0];
    ulong w1 = state[1];
    ulong w2 = state[2];
    ulong w3 = state[3];

    for (int round = 0; round < 2; round++)
    {
        // Rotate-left by 32 (the right shift is 64 - 32 = 32).
        w0 += w1;
        w1 ^= (w0 << 32) | (w0 >> 32);
        w1 += w0;
        w0 ^= (w1 << 32) | (w1 >> 32);
        w2 += w3;
        w3 ^= (w2 << 32) | (w2 >> 32);
        w3 += w2;
        w2 ^= (w3 << 32) | (w3 >> 32);

        // Rotate-left by 27 (the right shift is 64 - 27 = 37).
        w0 += w2;
        w2 ^= (w0 << 27) | (w0 >> 37);
        w2 += w0;
        w0 ^= (w2 << 27) | (w2 >> 37);
        w1 += w3;
        w3 ^= (w1 << 27) | (w1 >> 37);
        w3 += w1;
        w1 ^= (w3 << 27) | (w3 >> 37);

        // Rotate-left by 11 (the right shift is 64 - 11 = 53).
        w0 += w3;
        w3 ^= (w0 << 11) | (w0 >> 53);
        w3 += w0;
        w0 ^= (w3 << 11) | (w3 >> 53);
        w1 += w2;
        w2 ^= (w1 << 11) | (w1 >> 53);
        w2 += w1;
        w1 ^= (w2 << 11) | (w2 >> 53);
    }

    block[0] = w0;
    block[1] = w1;
    block[2] = w2;
    block[3] = w3;

    return block[counter];
}
按要求提供的最小可复现示例:
using System;
using System.Diagnostics;
/// <summary>
/// Benchmark driver: times ten million calls each of <c>GenConstant</c>,
/// <c>GenNoLoop</c> and <c>GenVariable</c> on one <c>Arx</c> instance and prints,
/// after each run, the elapsed milliseconds and the running sum in hexadecimal.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        const int Iterations = 10000000;

        Stopwatch sw = new Stopwatch();
        Arx rng = new Arx();

        // The sum is carried across all three runs and printed after each one,
        // so the generated values are actually consumed.
        long total = 0;

        // Run 1: remainder by a literal constant (reported as fast).
        sw.Restart();
        for (int n = 0; n < Iterations; n++)
        {
            total += rng.GenConstant(123);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
        Console.WriteLine("{0:x16}", total);

        // Run 2: remainder by the parameter, without the rejection loop (reported as fast).
        sw.Restart();
        for (int n = 0; n < Iterations; n++)
        {
            total += rng.GenNoLoop(123);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
        Console.WriteLine("{0:x16}", total);

        // Run 3: remainder by the parameter, with the rejection loop (reported as slow).
        sw.Restart();
        for (int n = 0; n < Iterations; n++)
        {
            total += rng.GenVariable(123);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
        Console.WriteLine("{0:x16}", total);
        sw.Reset();
    }
}
/// <summary>
/// Add-rotate-xor pseudo-random generator used by the benchmark.
/// </summary>
/// <remarks>
/// The state words, the output block and the output cursor are all static
/// fields, so every instance draws from the same stream, and constructing a new
/// instance overwrites <c>state[1]</c> for all of them.
/// </remarks>
class Arx
{
    public static ulong[] state = new ulong[4];
    public static ulong[] outBlock = new ulong[4];

    // Starts at -1 so that the very first Gen() call wraps the cursor to 0 and fills outBlock.
    static int counter = -1;

    /// <summary>
    /// Stores the seed in <c>state[1]</c>. A seed of 0 is replaced by the current
    /// UTC tick count.
    /// </summary>
    /// <param name="seed">Seed value; 0 selects the time-based seed.</param>
    public Arx(ulong seed = 0)
    {
        state[1] = seed != 0 ? seed : (ulong)DateTime.UtcNow.Ticks;
    }

    /// <summary>
    /// Returns the next 64-bit output word. Every fourth call (when the cursor
    /// wraps to 0) increments <c>state[0]</c>, mixes the four state words for two
    /// rounds and stores the result in <c>outBlock</c>.
    /// </summary>
    /// <returns>The word of <c>outBlock</c> selected by the updated cursor.</returns>
    public ulong Gen()
    {
        counter = (counter + 1) & 3;
        if (counter != 0)
        {
            return outBlock[counter];
        }

        ulong w0 = ++state[0];
        ulong w1 = state[1];
        ulong w2 = state[2];
        ulong w3 = state[3];

        for (int round = 0; round < 2; round++)
        {
            // Rotate-left by 32 (the right shift is 64 - 32 = 32).
            w0 += w1;
            w1 ^= (w0 << 32) | (w0 >> 32);
            w1 += w0;
            w0 ^= (w1 << 32) | (w1 >> 32);
            w2 += w3;
            w3 ^= (w2 << 32) | (w2 >> 32);
            w3 += w2;
            w2 ^= (w3 << 32) | (w3 >> 32);

            // Rotate-left by 27 (the right shift is 64 - 27 = 37).
            w0 += w2;
            w2 ^= (w0 << 27) | (w0 >> 37);
            w2 += w0;
            w0 ^= (w2 << 27) | (w2 >> 37);
            w1 += w3;
            w3 ^= (w1 << 27) | (w1 >> 37);
            w3 += w1;
            w1 ^= (w3 << 27) | (w3 >> 37);

            // Rotate-left by 11 (the right shift is 64 - 11 = 53).
            w0 += w3;
            w3 ^= (w0 << 11) | (w0 >> 53);
            w3 += w0;
            w0 ^= (w3 << 11) | (w3 >> 53);
            w1 += w2;
            w2 ^= (w1 << 11) | (w1 >> 53);
            w2 += w1;
            w1 ^= (w2 << 11) | (w2 >> 53);
        }

        outBlock[0] = w0;
        outBlock[1] = w1;
        outBlock[2] = w2;
        outBlock[3] = w3;

        return outBlock[counter];
    }

    /// <summary>
    /// Rejection-samples a non-negative 63-bit value not greater than
    /// <c>long.MaxValue - mod</c> and returns it modulo the literal 12345.
    /// </summary>
    /// <param name="mod">Only used to compute the rejection threshold.</param>
    /// <returns>The accepted sample modulo 12345.</returns>
    public long GenConstant(int mod)
    {
        long limit = long.MaxValue - mod;
        long sample;

        do
        {
            // Dropping the lowest bit keeps the cast to long non-negative.
            sample = (long)(Gen() >> 1);
        }
        while (sample > limit);

        return sample % 12345;
    }

    /// <summary>
    /// Rejection-samples a non-negative 63-bit value not greater than
    /// <c>long.MaxValue - mod</c> and returns it modulo <paramref name="mod"/>.
    /// </summary>
    /// <param name="mod">Divisor for both the rejection threshold and the remainder.</param>
    /// <returns>The accepted sample modulo <paramref name="mod"/>.</returns>
    public long GenVariable(int mod)
    {
        // NOTE(review): a negative mod would overflow this subtraction; in an unchecked
        // context the loop below would then never exit. Presumably callers pass mod > 0 - confirm.
        long limit = long.MaxValue - mod;
        long sample;

        do
        {
            // Dropping the lowest bit keeps the cast to long non-negative.
            sample = (long)(Gen() >> 1);
        }
        while (sample > limit);

        return sample % mod;
    }

    /// <summary>
    /// Takes a single non-negative 63-bit sample, without any rejection step, and
    /// returns it modulo <paramref name="mod"/>.
    /// </summary>
    /// <param name="mod">Divisor for the remainder.</param>
    /// <returns>The sample modulo <paramref name="mod"/>.</returns>
    public long GenNoLoop(int mod)
    {
        long a = (long)(Gen() >> 1);
        return a % mod;
    }
}
答案 0 :(得分:1)
这是一个优化器的问题。
首先毫无疑问,使用变量要比使用常量慢,因为加载变量需要更多时间。
但是,当您删除循环部分后,方法变得非常简单,优化器会将它们内联。请注意,方法被内联之后,rng.GenNoLoop(123) 中的表达式 a % mod 里的 mod 就可以被识别为常量。因此,这两种写法现在实际上是相同的。
要还原到未被优化的情形,您需要向 GenNoLoop 传入一个真正的变量。
// The modulus lives in a mutable static field instead of being written as a
// literal, so the call below passes a variable rather than the constant 123.
static int mod = 123;
static void Main(string[] args)
{
// NOTE(review): rng is not declared in this fragment; presumably it is the Arx
// instance created in the benchmark program - confirm.
rng.GenNoLoop(mod);
}
另一种选择是强制禁止该方法被内联:
[MethodImpl(MethodImplOptions.NoInlining)]
public long GenNoLoop(int mod)
答案 1 :(得分:0)
我使用此代码来测试两种方法的速度:
// Alternating benchmark: for 1000 rounds, times 100000 calls of GenVariable and
// then 100000 calls of GenConstant on the same Arx instance, accumulating the
// elapsed time of each variant separately, and finally dumps both totals in
// milliseconds (variable first, constant second).
// NOTE(review): Dump() is not defined in this fragment; presumably the LINQPad
// extension method - confirm.
void Main()
{
    const int Rounds = 1000;
    const int CallsPerRound = 100000;

    Stopwatch sw = new Stopwatch();
    TimeSpan variableTotal = TimeSpan.Zero;
    TimeSpan constantTotal = TimeSpan.Zero;
    Arx rng = new Arx();

    for (int round = 0; round < Rounds; round++)
    {
        // The sum is never printed; it only gives the generated values a consumer.
        long sum = 0;

        sw.Restart();
        for (int i = 0; i < CallsPerRound; i++)
        {
            sum += rng.GenVariable(123);
        }
        sw.Stop();
        variableTotal += sw.Elapsed;

        sum = 0;

        sw.Restart();
        for (int i = 0; i < CallsPerRound; i++)
        {
            sum += rng.GenConstant(123);
        }
        sw.Stop();
        constantTotal += sw.Elapsed;
    }

    variableTotal.TotalMilliseconds.Dump();
    constantTotal.TotalMilliseconds.Dump();
}
我得到的结果分别是 2890.5391 毫秒和 2805.8824 毫秒。使用变量的版本只慢了约 3%,差别并不大。