我有以下代码(很抱歉,它并不是最简的示例,我已经尽量从原始代码中精简)。
基本上,我在运行eval_s()
的方法/函数时遇到性能问题:
1)用eigvalsh()
找到4x4厄米矩阵的4个特征值
2)将特征值的倒数相加到变量result
3),我对由x,y,z
参数化的许多矩阵重复步骤1和2,将累积总和存储在result
中。
我在步骤3中重复进行计算(查找特征值和求和)的次数取决于代码中的变量ksep,因此在我的实际代码中需要增加这个次数(即ksep必须减小)。
但是eval_s()中的计算在x,y,z上有一个for循环,我想这确实会减慢速度。
[尝试ksep=0.5来理解我的意思。]
是否有一种方法可以对我的示例代码中指示的方法(或通常涉及查找参数化矩阵特征值的函数)进行矢量化处理?
代码:
ksep=0.5
p.s。代码的sympy部分可能看起来很奇怪,但是在我的原始代码中有其目的。
答案 0 :(得分:3)
您可以,并且方法如下:
def eval_s_vectorized(self, stiff):
    """Sum ``1 / (stiff + eigenvalue)`` over the matrices of all q-points.

    The matrices are still produced one q-point at a time by ``self._vfunc``;
    only the eigenvalue computation and the summation are batched.

    Args:
        stiff: Value added to every eigenvalue before taking the reciprocal.

    Returns:
        The real part of the accumulated sum minus ``4 * self._q_count``.

    Raises:
        AssertionError: If ``self._qs`` and ``self._q_count`` disagree, i.e.
            ``populate_qs`` has not been run.
    """
    # Raised explicitly instead of via ``assert`` so the check is not stripped
    # under ``python -O``; exception type and message are unchanged.
    if len(self._qs) != self._q_count:
        raise AssertionError("Run 'populate_qs' first!")
    # Stack the per-q-point matrices along a new leading axis so that a single
    # eigvalsh call handles all of them.
    mats = np.stack([self._vfunc(*k) for k in self._qs], axis=0)
    evs = np.linalg.eigvalsh(mats)
    result = np.sum(np.divide(1., (stiff + evs)))
    return result.real - 4 * self._q_count
这样做之后,Sympy表达式的求值仍然没有被向量化。这一部分的向量化有些棘手,主要是因为输入矩阵中含有常数1。
您可以通过修改Solver来得到完全向量化的版本,使其用数组常量替换vmat中的标量常量:
import itertools as it
import numpy as np
import sympy as sp
from sympy.abc import x, y, z
from sympy.core.numbers import Number
from sympy.utilities.lambdify import implemented_function
# Symbolic function ``xones`` with an attached NumPy implementation: given an
# array argument it returns an array of ones with the same length.  The
# mapping is also handed to lambdify (via ``modules``) so the name resolves
# in the generated function.
lfuncs = {
    'xones': implemented_function('xones', lambda arr: np.ones(len(arr))),
}
xones = lfuncs['xones']
def vectorizemat(mat):
    """Return a copy of *mat* whose constant entries broadcast like arrays.

    Every entry without free symbols is replaced by ``xones(s) * entry``,
    where ``s`` is one of the free symbols of *mat*.  After lambdification
    such an entry evaluates to an array instead of a scalar, so all entries
    of the matrix have matching lengths when evaluated on array inputs.

    Args:
        mat: SymPy matrix to convert; it is not modified.

    Returns:
        A copy of *mat* with its constant entries rewritten.

    Raises:
        ValueError: If *mat* has a constant entry but no free symbol to take
            the array length from.
    """
    ret = mat.copy()
    # Any free symbol works as the length reference; choosing by name keeps
    # the choice reproducible and avoids shadowing the imported ``x``.
    ref = min(mat.free_symbols, key=str, default=None)
    for i, j in it.product(*(range(s) for s in mat.shape)):
        entry = mat[i, j]
        # Testing for "no free symbols" also catches constants that are not
        # ``Number`` instances, such as ``pi`` or ``sqrt(2)``.
        if entry.free_symbols:
            continue
        if ref is None:
            raise ValueError(
                "cannot vectorize a matrix without free symbols")
        # NOTE(review): SymPy evaluates ``xones(ref) * 0`` to plain ``0``, so
        # a zero entry would stay scalar -- confirm whether that case matters.
        ret[i, j] = xones(ref) * entry
    return ret
class Solver:
    """Evaluate eigenvalue sums of a parametrized matrix on a grid of q-points.

    ``_vfunc`` is the lambdified, array-ready form of the matrix passed to the
    constructor; ``_qs`` holds the q-points and ``_q_count`` their number.
    Both are only valid after ``populate_qs`` has been called.
    """

    def __init__(self, vmat):
        """Lambdify *vmat* over ``(x, y, z)`` after vectorizing its constants."""
        self._vfunc = sp.lambdify((x, y, z),
                                  expr=vectorizemat(vmat),
                                  modules=[lfuncs, 'numpy'])
        self._q_count, self._qs = None, []  # these depend on ksep!

    def eval_s_vectorized_completely(self, stiff):
        """Sum ``1 / (stiff + eigenvalue)`` over all q-points in one batch.

        Args:
            stiff: Value added to every eigenvalue before taking the
                reciprocal.

        Returns:
            The real part of the accumulated sum minus ``4 * self._q_count``.

        Raises:
            AssertionError: If ``populate_qs`` has not been run.
        """
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; type and message are unchanged.
        if len(self._qs) != self._q_count:
            raise AssertionError("Run 'populate_qs' first!")
        # One call evaluates the matrix for every q-point; the transpose moves
        # the q-point axis to the front so eigvalsh sees a stack of matrices.
        evs = np.linalg.eigvalsh(self._vfunc(*self._qs.T).T)
        result = np.sum(np.divide(1., (stiff + evs)))
        return result.real - 4 * self._q_count

    def populate_qs(self, ksep: float = 1.7):
        """Fill ``_qs`` with a cubic grid of q-points spaced by *ksep*.

        Each axis is ``np.arange(-3*np.pi, 3.01*np.pi, ksep)``.  The rows of
        ``_qs`` are ``(kx, ky, kz)`` triples with ``kz`` varying fastest,
        which is the order ``itertools.product`` would produce.

        Args:
            ksep: Grid spacing; must be positive.

        Raises:
            ValueError: If *ksep* is not a positive number.
        """
        if not ksep > 0:
            raise ValueError(
                "ksep must be positive, got {!r}".format(ksep))
        axis = np.arange(-3*np.pi, 3.01*np.pi, ksep)
        # meshgrid with 'ij' indexing followed by a C-order reshape yields the
        # same row order as a nested product, without a Python-level loop.
        grid = np.meshgrid(axis, axis, axis, indexing='ij')
        self._qs = np.stack(grid, axis=-1).reshape(-1, 3)
        self._q_count = len(self._qs)
对于小型ksep
,矢量化版本的速度比原始版本快2倍,而完全矢量化版本的速度则快20倍:
# old version for ksep=.3
import timeit
print(timeit.timeit("test()", setup="from __main__ import test", number=10))
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
-85240.46154500882
118.42847006605007
# vectorized version for ksep=.3
import timeit
print(timeit.timeit("test()", setup="from __main__ import test", number=10))
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
64.95763925800566
# completely vectorized version for ksep=.3
import timeit
print(timeit.timeit("test()", setup="from __main__ import test", number=10))
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
5.648927717003971
由于舍入误差,向量化版本的结果与原始版本略有不同。这大概是因为result中求和的计算方式不同。
答案 1 :(得分:2)
@tel已完成大部分工作。这是您在20倍的基础上再获得2倍加速的方法。
手动进行线性代数运算。尝试之后我很震惊:numpy在小矩阵上的开销竟然如此之大:
>>> from timeit import timeit
# using eigvalsh
>>> print(timeit("test(False, 0.1)", setup="from __main__ import test", number=3))
-2301206.495955009
-2301206.495955009
-2301206.495955009
55.794611917983275
>>> print(timeit("test(False, 0.3)", setup="from __main__ import test", number=5))
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
-85240.46154498367
3.400342195003759
# by hand
>>> print(timeit("test(True, 0.1)", setup="from __main__ import test", number=3))
-2301206.495955076
-2301206.495955076
-2301206.495955076
26.67294767702697
>>> print(timeit("test(True, 0.3)", setup="from __main__ import test", number=5))
-85240.46154498379
-85240.46154498379
-85240.46154498379
-85240.46154498379
-85240.46154498379
1.5047460949863307
请注意 加速的一部分可能被共享代码掩盖了,仅在线性代数上,它似乎更多,尽管我并没有太认真地检查。
一个警告:我对矩阵做2×2分块,并利用Schur补来计算逆矩阵的对角元素。如果Schur补不存在,即左上或右下子矩阵不可逆,则此方法会失败。
这是修改后的代码:
import itertools as it
import numpy as np
import sympy as sp
from sympy.abc import x, y, z
from sympy.core.numbers import Number
from sympy.utilities.lambdify import implemented_function
# Symbolic function ``xones`` with an attached NumPy implementation: given an
# array argument it returns an array of ones with the same length.  The
# mapping is also handed to lambdify (via ``modules``) so the name resolves
# in the generated function.
lfuncs = {
    'xones': implemented_function('xones', lambda arr: np.ones(len(arr))),
}
xones = lfuncs['xones']
def vectorizemat(mat):
    """Return a copy of *mat* whose constant entries broadcast like arrays.

    Every entry without free symbols is replaced by ``xones(s) * entry``,
    where ``s`` is one of the free symbols of *mat*.  After lambdification
    such an entry evaluates to an array instead of a scalar, so all entries
    of the matrix have matching lengths when evaluated on array inputs.

    Args:
        mat: SymPy matrix to convert; it is not modified.

    Returns:
        A copy of *mat* with its constant entries rewritten.

    Raises:
        ValueError: If *mat* has a constant entry but no free symbol to take
            the array length from.
    """
    ret = mat.copy()
    # Any free symbol works as the length reference; choosing by name keeps
    # the choice reproducible and avoids shadowing the imported ``x``.
    ref = min(mat.free_symbols, key=str, default=None)
    for i, j in it.product(*(range(s) for s in mat.shape)):
        entry = mat[i, j]
        # Testing for "no free symbols" also catches constants that are not
        # ``Number`` instances, such as ``pi`` or ``sqrt(2)``.
        if entry.free_symbols:
            continue
        if ref is None:
            raise ValueError(
                "cannot vectorize a matrix without free symbols")
        # NOTE(review): SymPy evaluates ``xones(ref) * 0`` to plain ``0``, so
        # a zero entry would stay scalar -- confirm whether that case matters.
        ret[i, j] = xones(ref) * entry
    return ret
class Solver:
    """Evaluate eigenvalue sums of a parametrized 4x4 matrix on a q-point grid.

    ``_vfunc`` is the lambdified, array-ready form of the matrix passed to the
    constructor; ``_qs`` holds the q-points and ``_q_count`` their number.
    Both are only valid after ``populate_qs`` has been called.
    """

    def __init__(self, vmat):
        """Lambdify *vmat* over ``(x, y, z)`` after vectorizing its constants."""
        vmat = vectorizemat(vmat)
        self._vfunc = sp.lambdify((x, y, z),
                                  expr=vmat,
                                  modules=[lfuncs, 'numpy'])
        self._q_count, self._qs = None, []  # these depend on ksep!

    def eval_s_vectorized_completely(self, stiff):
        """Sum ``1 / (stiff + eigenvalue)`` over all q-points using eigvalsh.

        Args:
            stiff: Value added to every eigenvalue before taking the
                reciprocal.

        Returns:
            The real part of the accumulated sum minus ``4 * self._q_count``.

        Raises:
            AssertionError: If ``populate_qs`` has not been run.
        """
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; type and message are unchanged.
        if len(self._qs) != self._q_count:
            raise AssertionError("Run 'populate_qs' first!")
        # The transpose moves the q-point axis to the front so eigvalsh sees
        # a stack of matrices.
        mats = self._vfunc(*self._qs.T).T
        evs = np.linalg.eigvalsh(mats)
        result = np.sum(np.divide(1., (stiff + evs)))
        return result.real - 4 * self._q_count

    def eval_s_pp(self, stiff):
        """Compute the same quantity as the eigvalsh variant without eigenvalues.

        The sum of ``1 / (stiff + eigenvalue)`` equals the trace of the
        inverse of ``M + stiff * I``.  Each 4x4 matrix is split into 2x2
        blocks and the two diagonal blocks of the inverse are obtained as
        inverses of Schur complements, written out element by element.

        The element formulas treat the matrices as symmetric (the lower-left
        block is taken to be the transpose of the upper-right one), and the
        method fails if a diagonal 2x2 block or a Schur complement is
        singular.

        Args:
            stiff: Value added to the diagonal of every matrix.

        Returns:
            The accumulated trace minus ``4 * self._q_count``.

        Raises:
            AssertionError: If ``populate_qs`` has not been run.
        """
        if len(self._qs) != self._q_count:
            raise AssertionError("Run 'populate_qs' first!")
        mats = self._vfunc(*self._qs.T).T
        # einsum hands back a writable view of the diagonals, so this shifts
        # every matrix by stiff * I in place.
        np.einsum('...ii->...i', mats)[...] += stiff
        # View each 4x4 matrix as a 2x2 grid of 2x2 blocks and move the two
        # block indices to the front: A, B / C, D are stacks of 2x2 blocks.
        (A, B), (C, D) = mats.reshape(-1, 2, 2, 2, 2).transpose(1, 3, 0, 2, 4)
        res = 0
        # First pass: Schur complement of D (upper-left block of the inverse).
        # Second pass: roles swapped (lower-right block of the inverse).
        for AA, BB, DD in ((A, B, D), (D, C, A)):
            (a, b), (_, d) = DD.transpose(1, 2, 0)
            rdet = 1 / (a*d - b*b)[:, None]
            # Closed-form 2x2 inverse: swap the diagonal, negate the
            # off-diagonal entries, divide everything by the determinant.
            iD = DD[..., ::-1, ::-1].copy()
            iD.reshape(-1, 4)[..., 1:3] *= -rdet
            np.einsum('...ii->...i', iD)[...] *= rdet
            (Aa, Ab), (_, Ad) = AA.transpose(1, 2, 0)
            (Ba, Bb), (Bc, Bd) = BB.transpose(1, 2, 0)
            (Da, Db), (Dc, Dd) = iD.transpose(1, 2, 0)
            # Entries of the Schur complement AA - BB @ iD @ BB^T.
            sa = Aa - Ba*Ba*Da - 2*Bb*Ba*Db - Bb*Bb*Dd
            sd = Ad - Bd*Bd*Dd - 2*Bc*Bd*Db - Bc*Bc*Da
            sb = Ab - Ba*Bc*Da - Ba*Bd*Db - Bb*Bd*Dd - Bb*Bc*Dc
            # Trace of the inverse of the symmetric 2x2 matrix [[sa, sb], [sb, sd]].
            res += ((sa + sd) / (sa*sd - sb*sb)).sum()
        return res - 4 * self._q_count

    def populate_qs(self, ksep: float = 1.7):
        """Fill ``_qs`` with a cubic grid of q-points spaced by *ksep*.

        Each axis is ``np.arange(-3*np.pi, 3.01*np.pi, ksep)``.  The rows of
        ``_qs`` are ``(kx, ky, kz)`` triples with ``kz`` varying fastest,
        which is the order ``itertools.product`` would produce.

        Args:
            ksep: Grid spacing; must be positive.

        Raises:
            ValueError: If *ksep* is not a positive number.
        """
        if not ksep > 0:
            raise ValueError(
                "ksep must be positive, got {!r}".format(ksep))
        axis = np.arange(-3*np.pi, 3.01*np.pi, ksep)
        # meshgrid with 'ij' indexing followed by a C-order reshape yields the
        # same row order as a nested product, without a Python-level loop.
        grid = np.meshgrid(axis, axis, axis, indexing='ij')
        self._qs = np.stack(grid, axis=-1).reshape(-1, 3)
        self._q_count = len(self._qs)
def test(manual=False, ksep=0.3):
    """Build the example matrix, populate the q-grid and print the result.

    Args:
        manual: When true, use ``Solver.eval_s_pp``; otherwise use
            ``Solver.eval_s_vectorized_completely``.
        ksep: Grid spacing forwarded to ``Solver.populate_qs``.
    """
    # The six distinct off-diagonal entries of the symmetric 4x4 matrix.
    cos_xy_sum = sp.cos(x/4+y/4)
    cos_xz_sum = sp.cos(x/4+z/4)
    cos_yz_sum = sp.cos(y/4+z/4)
    cos_yz_diff = sp.cos(y/4-z/4)
    cos_xz_diff = sp.cos(x/4 - z/4)
    cos_xy_diff = sp.cos(x/4-y/4)
    rows = [
        [1, cos_xy_sum, cos_xz_sum, cos_yz_sum],
        [cos_xy_sum, 1, cos_yz_diff, cos_xz_diff],
        [cos_xz_sum, cos_yz_diff, 1, cos_xy_diff],
        [cos_yz_sum, cos_xz_diff, cos_xy_diff, 1],
    ]
    vmat = sp.Matrix(rows) * 2

    solver = Solver(vmat)
    # Smaller ksep means more q-points, which is where evaluation cost grows.
    solver.populate_qs(ksep=ksep)

    evaluate = (solver.eval_s_pp if manual
                else solver.eval_s_vectorized_completely)
    print(evaluate(0.65))