我正在尝试计算两组D维向量之间的成对欧氏距离,并尝试优化以下天真实现的运行时性能。 基本上下面的代码返回描述符(AxD)中每一行之间的成对平方欧几里德距离,并且码本(BxD)中的每一行返回结果(AxB)矩阵。
从python转向基于cython的实现,我看到了一个很好的改进。我是cython的新手,并且想知道是否可以进一步优化此实现以使性能更接近纯c或基于java的实现。
pairwise_dist.pyx
import numpy as np
cimport numpy as np
import math
import cython
np.import_array()
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t
@cython.boundscheck(False)
@cython.wraparound(False)
def _pdist( np.ndarray[DTYPE_t, ndim=2] codebook,
np.ndarray[DTYPE_t, ndim=2] descriptors,
np.ndarray[DTYPE_t, ndim=2] result ):
cdef int i, j, k
cdef int codebook_size = codebook.shape[0]
cdef int n_descriptors = descriptors.shape[0]
cdef int n_dimensions = codebook.shape[1]
cdef float dist, diff
for i in xrange(n_descriptors):
for j in xrange(codebook_size):
dist = 0.0
for k in xrange(n_dimensions):
diff = codebook[j,k] - descriptors[i,k]
dist = dist + diff*diff
result[i,j] = dist
test.py:
import time
import numpy
CODEBOOK_SIZE = 4096
CODEBOOK_DIM = 64
DESC_DIM = 64
DESC_SIZE = 4000
codebook = numpy.random.randint(256,size=(CODEBOOK_SIZE,CODEBOOK_DIM))
descriptors = numpy.random.randint(256,size=(DESC_SIZE,DESC_DIM))
codebook = codebook.astype(numpy.float64)
descriptors = descriptors.astype(numpy.float64)
result = numpy.zeros((descriptors.shape[0],codebook.shape[0]),codebook.dtype)
from pairwise_dist import _pdist
stime = time.time()
_pdist(codebook,descriptors,result)
etime = time.time()
print "Time taken: "+ str(stime-etime)
build.py:
from distutils.core import setup
from Cython.Build import cythonize
setup(
name = "pairwise_dist",
ext_modules = cythonize('pairwise_dist.pyx'),
)
python build_cython.py build_ext --inplace
答案 0 :(得分:1)
您是否出于教育原因使用Cython?否则,我建议使用scipy.spatial模块
import numpy as np
from scipy.spatial.distance import pdist, squareform
object_1 = [0.2, 4.5, 198, 0.003]
object_2 = [0.3, 2.0, 999, 0.001]
object_3 = [0.1, 9.2, 321, 0.023]
list_of_objects = [object_1, object_2, object_3]
# make a 4x3 matrix from list of objects
X = np.array(list_of_objects)
#calculate pairwise distances
distances = pdist(X)
#make a square matrix from result
distances_as_2d_matrix = squareform(distances)
print distances
print distances_as_2d_matrix
pdist
支持多个指标,欧几里德只有一个。