Question

作为在cython中重写我的游戏引擎的一部分，我正在尝试提高我的python + numpy类的矩阵和向量数学的性能，因为这是我之前遇到的主要瓶颈之一。这组模块为Vector2/3/4，Matrix2/3/4和Quaternion等类型定义了类。

借鉴glMatrix javascript库的做法，我认为这次可以做的一件事是从基于类的系统切换到只包含一组数学函数的模块，以进一步减少开销。这样，每次将两个向量相加时，我就不必构造并返回一个新的自定义对象。

为了测试这一点，我编写了一个基准测试演示：创建两个Vec2对象a和b，将它们相加得到Vec2对象out。代码分为三个文件：main.py执行计时，vec2.pyx包含cython代码，pyvec2.py包含python代码。以下是每个组件的代码：

main.py

pyvec2.py

vec2.pyx

import time
import array
import math3d.vec2 as vec2
import math3d.pyvec2 as pyvec2

def test(n, func, param_list):
    start = time.time()
    for i in range(n):
        func(*param_list)
    end = time.time()
    print func, end-start

# Pure-Python baselines: pyadd receives the two operands, pyadd2 additionally
# receives an output list as its first argument.
test(1000000, pyvec2.pyadd, [[1, 2], [3, 4]])
test(1000000, pyvec2.pyadd2, [[0, 0], [1, 2], [3, 4]])
# Cython variants that receive only the two operands (lists or "f" arrays).
test(1000000, vec2.add, [[1, 2], [3, 4]])
test(1000000, vec2.add2, [array.array("f", [1, 2]), array.array("f", [3, 4])])
test(1000000, vec2.add3, [array.array("f", [1, 2]), array.array("f", [3, 4])])
test(1000000, vec2.add4, [array.array("f", [1, 2]), array.array("f", [3, 4])])
# Cython variants that receive an output container as the first argument.
test(1000000, vec2.add5, [[0, 0], [1, 2], [3, 4]])
test(1000000, vec2.add6, [array.array("f", [0, 0]), array.array("f", [1, 2]), array.array("f", [3, 4])])
test(1000000, vec2.add7, [array.array("f", [0, 0]), array.array("f", [1, 2]), array.array("f", [3, 4])])
test(1000000, vec2.add8, [array.array("f", [0, 0]), array.array("f", [1, 2]), array.array("f", [3, 4])])
test(1000000, vec2.add9, [[0, 0], [1, 2], [3, 4]])

pyvec2.py

from libc.stdlib cimport malloc, free
from cpython cimport array
import array

def add(list a, list b):
    """Add elements 0 and 1 of ``a`` and ``b`` component-wise.

    Both arguments are declared as Python ``list``. The two sums are stored
    in a local C ``float[2]`` array, and that array is the return value.
    """
    cdef float[2] out = [0, 0]
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add2(float[:] a, float[:] b):
    """Add elements 0 and 1 of ``a`` and ``b`` component-wise.

    Both arguments are declared as one-dimensional ``float`` typed
    memoryviews. The two sums are stored in a local C ``float[2]`` array,
    and that array is the return value.
    """
    cdef float[2] out = [0, 0]
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add3(array.array a, array.array b):
    """Add elements 0 and 1 of ``a`` and ``b`` component-wise.

    Both arguments are declared as ``array.array``. The two sums are stored
    in a local C ``float[2]`` array, and that array is the return value.
    """
    cdef float[2] out = [0, 0]
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add4(array.array a, array.array b):
    """Add elements 0 and 1 of ``a`` and ``b`` component-wise.

    Both arguments are declared as ``array.array``. Unlike ``add3``, the
    result is written into a newly constructed ``array.array("f", [0, 0])``,
    which is returned.
    """
    # A fresh "f" array is built on every call to hold the result.
    cdef array.array out = array.array("f", [0, 0])
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add5(list out, list a, list b):
    """Write the component-wise sum of ``a`` and ``b`` into ``out``.

    All three arguments are declared as Python ``list``. Elements 0 and 1 of
    ``out`` are overwritten in place and ``out`` itself is returned.
    """
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add6(float[:] out, float[:] a, float[:] b):
    """Write the component-wise sum of ``a`` and ``b`` into ``out``.

    All three arguments are declared as one-dimensional ``float`` typed
    memoryviews. Elements 0 and 1 of ``out`` are overwritten in place and
    ``out`` is returned.
    """
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add7(array.array out, array.array a, array.array b):
    """Write the component-wise sum of ``a`` and ``b`` into ``out``.

    All three arguments are declared as ``array.array``. Elements 0 and 1 of
    ``out`` are overwritten in place and ``out`` itself is returned.
    """
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add8(array.array out, array.array a, array.array b):
    """Write the component-wise sum of ``a`` and ``b`` into ``out``.

    All three arguments are declared as ``array.array``. Elements 0 and 1 of
    ``out`` are overwritten in place and ``out`` itself is returned.
    """
    # NOTE(review): signature and body are identical to add7 as listed here;
    # presumably this variant was meant to differ - confirm against the
    # original source.
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

def add9(out, a, b):
    """Write the component-wise sum of ``a`` and ``b`` into ``out``.

    None of the arguments carries a type declaration. Elements 0 and 1 of
    ``out`` are overwritten in place and ``out`` itself is returned.
    """
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

运行def pyadd(a, b): out = [a[0] + b[0], a[1] + b[1]] def pyadd2(out, a, b): out[0] = a[0] + b[0] out[1] = a[1] + b[1] return out后的结果：

main.py

从这看起来，只是使用python列表比键入数组更快！此外，只是盲目地编译我的python函数而不键入cython产生了最好的结果！看起来执行代码所花费的时间很大一部分用于转换为python类型和从python类型转换。

因此我很想知道是否有更快的方法在cython端执行数学运算，同时最大限度地减少传递参数的python开销。我并不真正感兴趣的是必须公开list或array.array对象以直接访问我的向量或矩阵的内容。将python的指针传递给我的cython数学模块是理想的，但这似乎不可能，因为指针不是python对象。任何建议将不胜感激。

更新

main.py的运行结果：

<function pyadd at 0x0000000003354828> 0.380000114441
<function pyadd2 at 0x0000000003354908> 0.31299996376
<built-in function add> 0.261000156403
<built-in function add2> 0.680999994278
<built-in function add3> 0.268000125885
<built-in function add4> 0.601000070572
<built-in function add5> 0.144999980927
<built-in function add6> 1.06299996376
<built-in function add7> 0.241000175476
<built-in function add8> 0.237999916077
<built-in function add9> 0.141000032425

以下是我的Vec2类的代码。它由两个文件组成：第一个文件是基类_Vec，第二个文件从它继承出具体的Vec2类。

vec.py

Vec2

import math import numpy as np import random class _Vec(object): def __init__(self, *args): try: data, = args data = np.array(data, dtype=np.float32) cls_name = self.__class__.__name__ vec2_check = cls_name == "Vec2" and len(data) != 2 vec3_check = cls_name == "Vec3" and len(data) != 3 vec4_check = cls_name == "Vec4" and len(data) != 4 if any([vec2_check, vec3_check, vec4_check]) == True: raise TypeError("{0} is not a valid {1}".format(data, cls_name)) except ValueError: data = np.array(args, dtype=np.float32) self._data = data def __add__(self, other): if isinstance(other, self.__class__): return self.__class__(self._data + other._data) return self.__class__(self._data + other) def __radd__(self, other): return self.__class__(other + self._data) def __sub__(self, other): if isinstance(other, self.__class__): return self.__class__(self._data - other._data) return self.__class__(self._data - other) def __rsub__(self, other): return self.__class__(other - self._data) def __mul__(self, other): if isinstance(other, self.__class__): return self.__class__(self._data * other._data) return self.__class__(self._data * other) def __rmul__(self, other): return self.__class__(other * self._data) def __div__(self, other): if isinstance(other, self.__class__): return self.__class__(self._data / other._data) return self.__class__(self._data / other) def __rdiv__(self, other): return self.__class__(other / self._data) def __neg__(self): return self.__class__(-self._data) def __pos__(self): return self.__class__(+self._data) def __eq__(self, other): return np.array_equal(self._data, other._data) def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other): return self.square_length() < other.square_length() def __le__(self, other): return self.square_length() <= other.square_length() def __gt__(self, other): return self.square_length() > other.square_length() def __ge__(self, other): return self.square_length() >= other.square_length() def __repr__(self): return 
"{0}(data={1})".format(self.__class__.__name__, self.get_data()) def __str__(self): return np.array_str(self._data) def ceil(self): return self.__class__(np.ceil(self._data)) def floor(self): return self.__class__(np.floor(self._data)) def get_data(self): return self._data.flatten().tolist() def inverse(self): return self.__class__(1.0/self._data) def length(self): return float(np.linalg.norm(self._data)) def negate(self): return self.__class__(-self._data) def normalize(self): length = self.length() if length == 0.0: return self.__class__(np.zeros(self._data.shape())) return self.__class__(self._data/length) def round(self, decimal=0): return self.__class__(np.round(self._data, decimal)) def square_length(self): return float(np.sum(np.square(self._data))) @classmethod def distance(cls, a, b): c = b - a return c.length() @classmethod def dot(cls, a, b): return float(np.dot(a._data, b._data)) @classmethod def equals(cls, a, b, tolerance=0.0): diffs = np.fabs((a - b)._data) pairs = zip(list(np.fabs(a._data)), list(np.fabs(b._data))) tolerance_calcs = [tolerance * max(1, a_val, b_val) for (a_val, b_val) in pairs] tests = [d <= t for (d, t) in zip(diffs, tolerance_calcs)] return all(tests) @classmethod def lerp(cls, a, b, t): return a*(1-t) + b*t @classmethod def max_components(cls, a, b): return cls(np.maximum(a._data, b._data)) @classmethod def min_components(cls, a, b): return cls(np.minimum(a._data, b._data)) @classmethod def random(cls, n): return cls(np.random.rand((n))) @classmethod def square_distance(cls, a, b): c = b - a return c.square_length()

vec2.py

更新2：

正如kazemakase指出的那样，from vec import _Vec from vec3 import Vec3 from vec4 import Vec4 import math import numpy as np import random class Vec2(_Vec): @property def x(self): return float(self._data[0]) @x.setter def x(self, value): self._data[0] = float(value) @property def y(self): return float(self._data[1]) @y.setter def y(self, value): self._data[1] = float(value) def __repr__(self): return "Vec2(x={0}, y={1})".format(self.x, self.y) def transform_mat2(self, a): prod = np.dot(a._data.T, self._data.T).T return Vec2(prod) def transform_mat3(self, a): v3 = Vec3(self.get_data() + [1]) prod = np.dot(a._data.T, v3._data.T).T return Vec2(prod[0:2]) def transform_mat4(self, a): v4 = Vec4(self.get_data() + [0, 1]) prod = np.dot(a._data.T, v4._data.T).T return Vec2(prod[0:2]) @classmethod def random(cls): return super(Vec2, cls).random(2)中的一些值是整数。通过附加.0将所有内容定义为浮点数，我得到了以下时间：

main.py

这些似乎与原始时间相似，在这种情况下，5和9更快。

更新3：

正如BrenBarn指出的那样，解释一下我如何使用原来的python + numpy类，以及我为什么如此在意它的性能，可能会很有用。最初，我的整个3D游戏库项目都采用纯python，使用PyOpenGL渲染图形。为了在我的3d世界中定位网格/模型，每一帧我都需要计算一个Mat4变换矩阵，该矩阵定义该对象在世界中的位置、旋转和缩放，然后上传到GPU。当我定位许多对象（> 1000）时，我的应用程序的帧速率会变慢。虽然暂时禁用3D确实提高了性能，但我的应用程序仍然达不到60 fps。那时我才意识到，仅仅是在python中计算Mat4矩阵就是罪魁祸首。当我删除这些计算、只在原点绘制所有3d对象时，性能又恢复了。

（更新2中main.py的运行结果：）

<function pyadd at 0x0000000002FF4828> 0.384000062943
<function pyadd2 at 0x0000000002FF4908> 0.332000017166
<built-in function add> 0.227999925613
<built-in function add2> 0.640000104904
<built-in function add3> 0.258999824524
<built-in function add4> 0.556999921799
<built-in function add5> 0.145999908447
<built-in function add6> 0.983999967575
<built-in function add7> 0.217000007629
<built-in function add8> 0.236000061035
<built-in function add9> 0.131000041962

然后想知道我的所有math3d库是否都很慢，我想我会开始对所有这些类进行基准测试。我为Mat4类添加函数做了以下基准测试，因为它是测试中最简单，计算最便宜的函数：

Vec2

这些评论显示我刚刚加入两个import time from vec2 import Vec2 def add(a, b): out = a + b return out def test(n, func, param_list): start = time.time() for i in range(n): func(*param_list) end = time.time() print func, end-start test(1000000, add, [Vec2([1.0, 2.0]), Vec2([3.0, 4.0])]) #<function add at 0x00000000041B1CF8> 2.81699991226 (using the real def add(a, b) function) #<function add at 0x000000000362CCF8> 0.168999910355 (just passing in values to def add(a, b): pass)的时间。从那以后，我得出结论，数学很慢，有很多python函数调用+类开销，这是我的3d库中需要考虑的性能瓶颈。我希望这为这个问题提供了一些理由。

更新4：

Paul Cornelius提到只承担一次从python到cython的转换开销，这让我思考：为什么不在每次创建“Vec2对象”时，只获取一个指向其float *数据的“指针”？然后，我可以在之后的数学运算中传递这些指针，cython可以解引用这些指针来获取实际数据并执行数学运算。结果如下：

main.py

float *

vec2.pyx

import time
import array
import math3d.vec2 as vec2

def make_list(a, b):
    """Build and return the two-element list ``[a, b]``.

    Serves as the pure-Python counterpart that the benchmark script times
    next to ``vec2.create``. The list is returned (the earlier version built
    it and then discarded it), so callers receive the value the function's
    name promises.
    """
    return [a, b]

def test(n, func, param_list):
    start = time.time()
    for i in range(n):
        func(*param_list)
    end = time.time()
    print func, end-start

# Time handle creation in Cython against building a plain Python list.
# NOTE(review): vec2.create allocates with malloc and no release call is
# visible in this listing, so this timed loop presumably leaks every
# allocation - confirm.
test(1000000, vec2.create, [1, 2])
test(1000000, make_list, [1, 2])
# Create the two operands and the output vector once, outside the timed loop.
a = vec2.create(1, 2)
#b = vec2.get_data(a)
b = vec2.create(3, 4)
c = vec2.create(0, 0)
# Compare the pointer-handle add with the untyped list-based add2.
test(1000000, vec2.add, [c, a, b])
test(1000000, vec2.add2, [[0, 0], [1, 2], [3, 4]])
# Read back the two floats stored behind handle c.
print vec2.get_data(c)

时间安排

def add(uintptr_t out, uintptr_t a, uintptr_t b):
    """Add two float pairs addressed by integer handles and store the result.

    Each argument is an integer that is cast directly to ``float *``; no
    validation of the addresses is performed. Elements 0 and 1 behind ``a``
    and ``b`` are summed and written to elements 0 and 1 behind ``out``.
    Nothing is returned.
    """
    # Reinterpret the integer handles as raw float pointers.
    cdef float *a_data = <float *>a
    cdef float *b_data = <float *>b
    cdef float *out_data = <float *>out
    out_data[0] = a_data[0] + b_data[0]
    out_data[1] = a_data[1] + b_data[1]

def create(float x, float y):
    """Allocate a two-float vector on the C heap and return its address.

    Args:
        x: First component.
        y: Second component.

    Returns:
        The address of the newly allocated buffer as an integer
        (``uintptr_t``), holding ``x`` at index 0 and ``y`` at index 1.

    Raises:
        MemoryError: If ``malloc`` cannot provide the buffer.

    The buffer is not released here, so the caller owns it and is
    responsible for eventually freeing it.
    """
    # Room for BOTH components: requesting only sizeof(float) and then
    # writing two elements would overflow the allocation.
    cdef float *a = <float *>malloc(2 * sizeof(float))
    if a == NULL:
        raise MemoryError()
    a[0] = x
    a[1] = y
    return <uintptr_t>a

def get_data(uintptr_t u_ptr):
    """Return the two floats stored at address ``u_ptr`` as a tuple.

    The integer handle is cast directly to ``float *`` without any
    validation, and elements 0 and 1 are read from it.
    """
    cdef float *components = <float *>u_ptr
    return (components[0], components[1])

def add2(out, a, b):
    """Write the component-wise sum of ``a`` and ``b`` into ``out``.

    None of the arguments carries a type declaration. Elements 0 and 1 of
    ``out`` are overwritten in place and ``out`` itself is returned.
    """
    out[0] = a[0] + b[0]
    out[1] = a[1] + b[1]
    return out

运行结果：

<built-in function create> 0.19000005722
<function make_list at 0x0000000002994828> 0.269999980927
<built-in function add> 0.111000061035
<built-in function add2> 0.141999959946
(4.0, 6.0)

当然，在python中处理uintptr_t整数是非常不安全的，因为它们可能会在python端被误用+运算符相加。此外，目前尚不清楚这种轻微的性能优势（100万次操作约0.03秒）是否真的值得。

Answer 1

Cython的优势在于基本C数据类型（整数，浮点数，双精度数）和数组的算术运算。代码中唯一的算术运算是两个简单的添加。其余的是数据类型转换和数组元素访问。正如您的时间结果所示，这些操作肯定会占主导地位。每次执行从Python到Cython的函数调用时，都会有类型检查和转换开销。你需要在这个障碍的Cython一侧进行足够的数字运算才能让它变得有价值。

这不是Cython的一个很好的用例。

添加很多双元素列表真的是应用程序的瓶颈吗？如果是，您应该考虑将所有数据存储在Cython变量中，只会产生一次翻译惩罚。只有在必要时才将数据移动到Python端。如果不是，则需要创建更实际的测试。

我的经验是Cython通常可以匹配甚至超过numpy，尽管它需要花费一些精力进行优化。（numpy / scipy当然具有提供比我在一百次生命中创造的更多功能的“轻微”优势）。但是同样也必须将Python转换为C数据类型的过程相同。

在cython中编写快速的3d矩阵和矢量库

1 个答案: