我正在比较这个 F# 函数的性能:
/// Builds a sequence in which every even number i in 2L..N appears four
/// times, turns it into running totals starting from 1L with Seq.scan, and
/// returns the sum of all those running totals (the initial 1L included).
let e28 N =
    seq { for i in 2L..2L..N do for j in 1..4 -> i }
    |> Seq.scan (+) 1L
    |> Seq.sum
与以下 Python 3.3 的等价实现进行对比:
def e28a(N=100000):
    """Return the sum of all running totals built from the even widths up to N.

    Starting from 1, every even ``width`` in ``2..N`` is added to a running
    value four times; each intermediate value (and the initial 1) is added
    to the result. For ``N < 2`` the loops do not run and the result is 1.
    """
    diag_number = 1
    # Named `total` rather than `sum` so the builtin is not shadowed.
    total = diag_number
    for width in range(2, N + 1, 2):
        for _ in range(4):
            diag_number += width
            total += diag_number
    return total
import itertools as it
def e28b(N=100000):
    """Return the same total as the explicit-loop version, using itertools.

    A generator yields every even width in ``2..N`` four times; it is
    prefixed with 1, turned into running totals by ``accumulate``, and the
    running totals are summed.
    """
    steps = (width for width in range(2, N + 1, 2) for _ in range(4))
    return sum(it.accumulate(it.chain([1], steps)))
import numpy as np
def e28c(N=100000):
    """Return the sum of running totals computed with NumPy.

    The step values (1 followed by every even width in ``2..N`` repeated
    four times) are loaded into an ``int64`` array, ``cumsum`` produces the
    running totals, and ``np.sum`` adds them up. The result is a NumPy
    ``int64`` scalar, so totals beyond the int64 range would not be
    represented exactly.
    """
    steps = (width for width in range(2, N + 1, 2) for _ in range(4))
    # itertools is imported as `it` in this file; a bare `chain` is undefined
    # and raised NameError.
    values = np.fromiter(it.chain([1], steps), np.int64)
    return np.sum(np.cumsum(values))
我在 Windows 7 上用 64 位 CPython 3.3.1 测得的性能比 C++ 慢约 574 倍。以下是 N = 100000 时的耗时:
e28:23ms; e28a:48.4ms; e28b:49.7ms; e28c:40.2ms; C++ 版本:0.07ms。在不改变基础算法的情况下,优化 Python 代码是否有唾手可得的改进(low-hanging fruit)?
答案 0 :(得分:4)
通过切换到过程式的、可变的写法(比如你的 Python e28a),F# 版本可以加速约 10 倍。当“有效载荷操作”(在这种情况下只是 +)如此微不足道时,组合子(combinator)的使用最终会带来相对显著的开销。顺便一提,Seq.sum 使用了带溢出检查的算术(checked arithmetic),这也增加了一些开销。
F# 的一个好处是,如果热点路径(hot path)需要,你可以退回到过程式/可变的风格。
/// Combinator-based version on uint64: yields every even number i in
/// 2UL..N four times, scans running totals starting from 1UL, and sums them.
let e28_original N =
    seq {
        for i in 2UL..2UL..N do
            for j in 1..4 do
                yield i
    }
    |> Seq.scan (+) 1UL
    |> Seq.sum
/// Imperative version on uint64: `sum` is the running value that each even
/// number i in 2UL..N is added to four times; `total` accumulates every
/// intermediate value of `sum` (starting with the initial 1UL).
let e28_mutable N =
    let mutable sum = 1UL
    let mutable total = sum
    for i in 2UL..2UL..N do
        for j in 1..4 do
            sum <- sum + i
            total <- total + sum
    total
/// Calls f once and discards the result, then calls it a second time under
/// a Stopwatch and prints that second result together with the elapsed time.
let time f =
    f () |> ignore // allow for warmup / JIT
    let sw = System.Diagnostics.Stopwatch.StartNew()
    let result = f ()
    sw.Stop()
    printfn "Result: %A Elapsed: %A" result sw.Elapsed
// Time both implementations for N = 100000UL, combinator version first.
(fun () -> e28_original 100000UL) |> time
(fun () -> e28_mutable 100000UL) |> time
结果
Result: 666691667100001UL Elapsed: 00:00:00.0429414
Result: 666691667100001UL Elapsed: 00:00:00.0034971
答案 1 :(得分:3)
使用你的F#版本我得到了:
> e28(100000L);;
Real: 00:00:00.061, CPU: 00:00:00.062, GC gen0: 2, gen1: 0, gen2: 0
val it : int64 = 666691667100001L
使用:
/// Variant that expands each even number x in 2L..N into four copies with
/// Seq.collect instead of a nested `for`, then scans running totals from 1L
/// and sums them.
let e28d N =
    seq { 2L..2L..N }
    |> Seq.collect (fun x -> seq { yield x; yield x; yield x; yield x })
    |> Seq.scan (+) 1L
    |> Seq.sum
我得到了:
> e28d(100000L);;
Real: 00:00:00.040, CPU: 00:00:00.031, GC gen0: 2, gen1: 0, gen2: 0
val it : int64 = 666691667100001L
由于 F# 是编译执行的,而 Python 是解释执行的,因此你可能很难让 Python 的性能与 F# 完全相当。话虽如此,上述改进同样适用于 Python:
# Baseline session: for each even width an inner 4-iteration loop updates the
# running value and adds it to the total. The local name `sum` shadows the
# builtin inside this function.
>>> def e28a(N = 100000):
diagNumber = 1;
sum = diagNumber;
for width in range(2, N+1, 2):
for j in range(4):
diagNumber += width;
sum += diagNumber;
return sum;
# Time 10 calls of e28a(); the number on the last line is what was printed.
>>> if __name__ == '__main__':
import timeit
print(timeit.timeit("e28a()", setup="from __main__ import e28a", number=10))
0.5249497228663813
# Unrolled session: the inner 4-iteration loop is replaced by four explicit
# copies of its two statements.
>>> def e28a(N = 100000):
diagNumber = 1;
sum = diagNumber;
for width in range(2, N+1, 2):
diagNumber += width;
sum += diagNumber;
diagNumber += width;
sum += diagNumber;
diagNumber += width;
sum += diagNumber;
diagNumber += width;
sum += diagNumber;
return sum;
# Same measurement as before: 10 calls of e28a(), printed value shown below.
>>> if __name__ == '__main__':
import timeit
print(timeit.timeit("e28a()", setup="from __main__ import e28a", number=10))
0.2585966329330063
>>>
部分改进来自较少的函数调用,即:
# Hoisted-range session: range(4) is created once before the outer loop and
# reused by the inner loop, instead of being called on every outer iteration.
>>> def e28a(N = 100000):
diagNumber = 1;
sum = diagNumber;
temp_range = range(4) # Change here: build range(4) once, outside the outer loop
for width in range(2, N+1, 2):
for j in temp_range: # Change here: iterate the prebuilt range object
diagNumber += width;
sum += diagNumber;
return sum;
# Time 10 calls of e28a(); the printed value is shown on the following line.
>>> if __name__ == '__main__':
import timeit
print(timeit.timeit("e28a()", setup="from __main__ import e28a", number=10))
0.40251470339956086
>>>
我认为另一部分来自删除循环。在Python中,这两者都相当昂贵。
答案 2 :(得分:1)
在我的机器上,这几乎快了一倍。它使用了 memoization,也使用了基本的算术推导。
您必须定义一个全局变量。
# Module-level accumulator used by e28d; its starting value is 2.
summi = 2


def e28d(N=100000):
    """Return the same total as the loop versions with one step per width.

    For every even ``width`` in ``2..N`` the inner helper is called with
    ``width * 4``. It advances the global ``summi`` and returns the combined
    contribution of the four running totals belonging to that width; the
    contributions are summed and the initial 1 is added.

    ``summi`` is reset on entry, so repeated calls (for example under
    ``timeit``) return the same value. It is left at its final value when
    the function returns.
    """
    global summi
    # Without this reset a second call would continue from the previous
    # call's final accumulator and produce a wrong total.
    summi = 2

    def memo(width):
        global summi
        summi += width * 4 + 4
        return summi - width * 2 + 2

    return sum(memo(width * 4) for width in range(2, N + 1, 2)) + 1
结果:
e28a:
0.0591201782227秒
e28d:
0.0349650382996秒
希望它至少是建设性的。注意:您必须根据数字是否为奇数对其进行调制。
<strong>更新</strong> 这是一个在 Python 中运行速度快约一百倍的函数(N = 100000 时大约 0.5 ms),完全避免了循环:
import math
def e28e(X=100000):
    """Return the total of the loop versions from a closed-form expression.

    The result equals the value obtained by adding every even width in
    ``2..X`` four times to a running value that starts at 1 and summing all
    intermediate values. The formula splits ``X`` by ``X // 6`` and by the
    residue class ``(X % 6) // 2`` and adds a per-class correction term.

    All arithmetic is done on integers: floor division keeps the branch
    selection correct on Python 3 (true division sent odd ``X`` to the wrong
    branch), and integer multiplication keeps large results exact.

    For ``X < 2`` there is no even width to add, so the result is 1.
    """
    if X < 2:
        return 1

    sixths, remainder = divmod(X, 6)
    residue_class = remainder // 2

    # sum(range(k)) written as the triangular number k * (k - 1) // 2.
    if residue_class == 0:
        correction = 16 * sixths + 12 * (sixths * (sixths - 1) // 2)
    elif residue_class == 1:
        correction = 44 * sixths + 36 * (sixths * (sixths - 1) // 2) + 7
    else:
        correction = 28 * (sixths + 1) + 60 * ((sixths + 1) * sixths // 2) - 2

    # An odd X behaves like the even number just below it.
    X -= remainder % 2
    diag = X * X + 2 * X + 1
    weight = sixths + X // 2 + 1
    return diag * weight + correction