I want to (efficiently) find all pairs of points closer than some distance max_d. My current approach, using cdist, is:
import numpy as np
from scipy.spatial.distance import cdist
def close_pairs(X,max_d):
    d = cdist(X,X)
    I,J = (d<max_d).nonzero()
    IJ = np.sort(np.vstack((I,J)), axis=0)
    # remove diagonal elements
    IJ = IJ[:,np.diff(IJ,axis=0).ravel()!=0]
    # remove duplicates
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)
    return pairs
def test():
    X = np.random.rand(100,2)*20
    p = close_pairs(X,2)

    from matplotlib import pyplot as plt
    plt.clf()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p,0].T,X[p,1].T,'-b')
But I think this is overkill (and not very readable), since most of the work is done only to remove self-distances and duplicates.

My main question is: is there a better way to do this?

(Note: the output type (array, set, ...) does not matter at this point.)
My current thinking is to use pdist, which returns a condensed distance array containing only the right pairs. However, once I have found the suitable index k in the condensed distance array, how do I work out which i,j pair it is equivalent to?

So the alternative question is: is there an easy way to get the list of coordinate pairs corresponding to the entries of the pdist output, i.e. a mapping

f(k) -> i,j
such that cdist(X,X)[i,j] = pdist(X)[k]
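One minimal sketch along these lines, using np.triu_indices as a precomputed lookup instead of an explicit f(k) (it enumerates the upper triangle in the same row-major order that pdist uses), not necessarily the fastest way:

import numpy as np
from scipy.spatial.distance import pdist

def close_pairs_pdist_lookup(X, max_d):
    # illustrative helper, not one of the functions above
    n = len(X)
    i, j = np.triu_indices(n, k=1)   # i[k], j[k] correspond to pdist(X)[k]
    mask = pdist(X) < max_d
    return np.column_stack((i[mask], j[mask]))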
Answer 0 (score: 4)
In my experience, there are two ways of finding neighbour lists in 3D that are fastest. One is to write the most naive double-loop code in C++ or Cython (in my case, both). It runs in N^2 time, but is very fast for small systems. The other way is a linear-time algorithm. SciPy's ckdtree is a good choice, but has limitations. Neighbour-list finders from molecular dynamics software are the most powerful, but they are very hard to wrap, and their initialization time can be slow.
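For illustration only (an extra sketch, not the code benchmarked below), here is a minimal pure-Python version of the linear-time idea: bin the points into cells of side max_d and compare each point only against points in its own and adjacent cells. Real neighbour-list finders do this in compiled code; this toy version just shows the principle:

import itertools
from collections import defaultdict
import numpy as np

def close_pairs_cells(X, max_d):
    # illustrative helper, not one of the benchmarked methods
    cells = defaultdict(list)
    for idx, p in enumerate(X):
        cells[tuple((p // max_d).astype(int))].append(idx)
    # a point can only have neighbours in its own cell or the adjacent ones
    offsets = list(itertools.product((-1, 0, 1), repeat=X.shape[1]))
    pairs = []
    for cell, members in cells.items():
        for off in offsets:
            neigh = cells.get(tuple(c + o for c, o in zip(cell, off)), [])
            for i in members:
                for j in neigh:
                    if i < j and np.sum((X[i] - X[j]) ** 2) < max_d ** 2:
                        pairs.append((i, j))
    return np.array(pairs, dtype=int).reshape(-1, 2)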
Below I compare four methods:

a naive Cython double-loop code (given below)
scipy.spatial.ckdtree
scipy.spatial.distance.pdist
OpenMM (through openmm-polymer)
Test setup: n points scattered in a rectangular box at volume density 0.2. System sizes range from 10 to 1000000 (a million) particles. Contact radii are taken from 0.5, 1, 2, 4, 7, 10. Note that since the density is 0.2, at contact radius 0.5 we have on average about 0.1 contacts per particle, at 1 about 0.8, at 2 about 6.4, and at 10 about 800! Contact finding was repeated several times for small systems and done once for systems above 30k particles. If the time per call exceeded 5 seconds, the run was aborted.
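For reference, these contact counts follow directly from the density times the sphere volume, the same formula used in the plot titles of the test code below; a quick check:

import numpy as np

density = 0.2  # particles per unit volume, as in the test setup
for r in [0.5, 1, 2, 4, 7, 10]:
    contacts = density * 4.0 / 3.0 * np.pi * r ** 3  # expected neighbours within radius r
    print("radius {:>4}: ~{:.1f} contacts per particle".format(r, contacts))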
Setup: dual Xeon 2687Wv3, 128 GB RAM, Ubuntu 14.04, python 2.7.11, scipy 0.16.0, numpy 1.10.1. None of the codes used parallel optimizations (except OpenMM, though its parallel part is so fast that it is not even noticeable on a CPU plot; most of the time there is spent piping data to and from OpenMM).
Results: note that the plots below are log-scale and span more than 6 orders of magnitude; even a small visual difference can actually be a factor of 10.
For systems smaller than 1000 particles, the Cython code is always faster. Beyond 1000 particles, however, the result depends on the contact radius. The pdist implementation is always slower than the Cython one and takes much more memory, because it explicitly creates a distance matrix, which is slow due to the sqrt.

ckdtree is a good choice for all system sizes; it performs only about 3-10 times worse than OpenMM. Installing OpenMM is quite tricky; you can read more in the file "contactmaps.py" at http://bitbucket.org/mirnylab/openmm-polymer or in its readme. The results below show, however, that OpenMM is only advantageous for N > 100k particles with just 5-50 contacts per particle.
The Cython code is below:
import numpy as np
cimport numpy as np
cimport cython

cdef extern from "<vector>" namespace "std":
    cdef cppclass vector[T]:
        cppclass iterator:
            T operator*()
            iterator operator++()
            bint operator==(iterator)
            bint operator!=(iterator)
        vector()
        void push_back(T&)
        T& operator[](int)
        T& at(int)
        iterator begin()
        iterator end()

np.import_array()  # initialize C API to call PyArray_SimpleNewFromData

cdef public api tonumpyarray(int* data, long long size) with gil:
    if not (data and size >= 0): raise ValueError
    cdef np.npy_intp dims = size
    # NOTE: it doesn't take ownership of `data`. You must free `data` yourself
    return np.PyArray_SimpleNewFromData(1, &dims, np.NPY_INT, <void*>data)

@cython.boundscheck(False)
@cython.wraparound(False)
def contactsCython(inArray, cutoff):
    inArray = np.asarray(inArray, dtype = np.float64, order = "C")
    cdef int N = len(inArray)
    cdef np.ndarray[np.double_t, ndim = 2] data = inArray
    cdef int j,i
    cdef double curdist
    cdef double cutoff2 = cutoff * cutoff  # IMPORTANT to avoid slow sqrt calculation
    cdef vector[int] contacts1
    cdef vector[int] contacts2
    for i in range(N):
        for j in range(i+1, N):
            curdist = (data[i,0] - data[j,0]) **2 +(data[i,1] - data[j,1]) **2 + (data[i,2] - data[j,2]) **2
            if curdist < cutoff2:
                contacts1.push_back(i)
                contacts2.push_back(j)
    cdef int M = len(contacts1)
    cdef np.ndarray[np.int32_t, ndim = 2] contacts = np.zeros((M,2), dtype = np.int32)
    for i in range(M):
        contacts[i,0] = contacts1[i]
        contacts[i,1] = contacts2[i]
    return contacts
Compiling the Cython code (or a makefile):
cython --cplus fastContacts.pyx
g++ -g -march=native -Ofast -fpic -c fastContacts.cpp -o fastContacts.o `python-config --includes`
g++ -g -march=native -Ofast -shared -o fastContacts.so fastContacts.o `python-config --libs`
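Alternatively (an extra sketch, not part of the original answer), the same build can be driven by a minimal setup.py; the file and module names are assumed to match the commands above:

# setup.py -- minimal sketch; assumes the Cython source is fastContacts.pyx
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np

ext = Extension(
    "fastContacts",
    sources=["fastContacts.pyx"],
    language="c++",                       # the .pyx wraps std::vector
    include_dirs=[np.get_include()],
    extra_compile_args=["-O3", "-march=native"],
)

setup(ext_modules=cythonize(ext))

Build it in place with python setup.py build_ext --inplace.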
Test code:
from __future__ import print_function, division

import signal
import time
from contextlib import contextmanager

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import ckdtree
from scipy.spatial.distance import pdist

from contactmaps import giveContactsOpenMM  # remove this unless you have OpenMM and openmm-polymer libraries installed
from fastContacts import contactsCython


class TimeoutException(Exception): pass


@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)


matplotlib.rcParams.update({'font.size': 8})


def close_pairs_ckdtree(X, max_d):
    tree = ckdtree.cKDTree(X)
    pairs = tree.query_pairs(max_d)
    return np.array(list(pairs))


def condensed_to_pair_indices(n, k):
    x = n - (4. * n ** 2 - 4 * n - 8 * k + 1) ** .5 / 2 - .5
    i = x.astype(int)
    j = k + i * (i + 3 - 2 * n) / 2 + 1
    return np.array([i, j]).T


def close_pairs_pdist(X, max_d):
    d = pdist(X)
    k = (d < max_d).nonzero()[0]
    return condensed_to_pair_indices(X.shape[0], k)


a = np.random.random((100, 3)) * 3  # test set
methods = {"cython": contactsCython, "ckdtree": close_pairs_ckdtree, "OpenMM": giveContactsOpenMM,
           "pdist": close_pairs_pdist}

# checking that each method gives the same value
allUniqueInds = []
for ind, method in methods.items():
    contacts = method(a, 1)
    uniqueInds = contacts[:, 0] + 100 * contacts[:, 1]  # unique index of each contact
    allUniqueInds.append(np.sort(uniqueInds))  # adding sorted unique contacts
for j in allUniqueInds:
    assert np.allclose(j, allUniqueInds[0])

# now actually doing testing
repeats = [30, 30, 30, 30, 30, 20, 20, 10, 5, 3, 2, 1, 1, 1]
sizes = [10, 30, 100, 200, 300, 500, 1000, 2000, 3000, 10000, 30000, 100000, 300000, 1000000]
systems = [[np.random.random((n, 3)) * ((n / 0.2) ** 0.333333) for k in range(repeat)] for n, repeat in
           zip(sizes, repeats)]

for j, radius in enumerate([0.5, 1, 2, 4, 7, 10]):
    plt.subplot(2, 3, j + 1)
    plt.title("Radius = {0}; {1:.2f} cont per particle".format(radius, 0.2 * (4 / 3 * np.pi * radius ** 3)))

    times = {i: [] for i in methods}
    for name, method in methods.items():
        for n, system, repeat in zip(sizes, systems, repeats):
            if name == "pdist" and n > 30000:
                break  # memory issues
            st = time.time()
            try:
                with time_limit(5 * repeat):
                    for ind in range(repeat):
                        k = len(method(system[ind], radius))
            except:
                print("Run aborted")
                break
            end = time.time()
            mytime = (end - st) / repeat
            times[name].append((n, mytime))
            print("{0} radius={1} n={2} time={3} repeat={4} contPerParticle={5}".format(name, radius, n, mytime, repeat, 2 * k / n))

    for name in sorted(times.keys()):
        plt.plot(*zip(*times[name]), label=name)
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("System size")
    plt.ylabel("Time (seconds)")
    plt.legend(loc=0)

plt.show()
Answer 1 (score: 2)
Here is how to do it using the cKDTree module. See query_pairs.
import numpy as np
from scipy.spatial.distance import cdist
from scipy.spatial import ckdtree

def close_pairs(X,max_d):
    d = cdist(X,X)
    I,J = (d<max_d).nonzero()
    IJ = np.sort(np.vstack((I,J)), axis=0)
    # remove diagonal elements
    IJ = IJ[:,np.diff(IJ,axis=0).ravel()!=0]
    # remove duplicates
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)
    return pairs

def close_pairs_ckdtree(X, max_d):
    tree = ckdtree.cKDTree(X)
    pairs = tree.query_pairs(max_d)
    return np.array(list(pairs))

def test():
    np.random.seed(0)
    X = np.random.rand(100,2)*20
    p = close_pairs(X,2)
    q = close_pairs_ckdtree(X, 2)

    from matplotlib import pyplot as plt
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p,0].T,X[p,1].T,'-b')

    plt.figure()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[q,0].T,X[q,1].T,'-b')

    plt.show()

test()
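As a side note (an extra sketch, not part of this answer): if your SciPy version supports it, query_pairs can return an ndarray directly via the output_type argument, which avoids the np.array(list(pairs)) conversion:

import numpy as np
from scipy.spatial import cKDTree

def close_pairs_ckdtree_arr(X, max_d):
    # illustrative helper; assumes a SciPy version with the output_type argument
    tree = cKDTree(X)
    # returns an (m, 2) integer array of index pairs directly
    return tree.query_pairs(max_d, output_type='ndarray')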
Answer 2 (score: 2)
I finally found it myself. The function that converts an index k of the condensed distance array into the equivalent i,j of the square distance array is:
def condensed_to_pair_indices(n,k):
    x = n-(4.*n**2-4*n-8*k+1)**.5/2-.5
    i = x.astype(int)
    j = k+i*(i+3-2*n)/2+1
    return i,j
I had to play a little with sympy to find it. Now, computing all pairs of points closer than a given distance is:
def close_pairs_pdist(X,max_d):
    d = pdist(X)
    k = (d<max_d).nonzero()[0]
    return condensed_to_pair_indices(X.shape[0],k)
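A quick sanity check of the formula (just a sketch): np.triu_indices enumerates the upper triangle in the same row-major order that pdist uses, so the two can be compared directly:

import numpy as np
from scipy.spatial.distance import pdist, squareform

def condensed_to_pair_indices(n, k):  # copied from above
    x = n-(4.*n**2-4*n-8*k+1)**.5/2-.5
    i = x.astype(int)
    j = k+i*(i+3-2*n)/2+1
    return i, j

n = 50
X = np.random.rand(n, 3)
d = pdist(X)

i, j = condensed_to_pair_indices(n, np.arange(len(d)))
i, j = i.astype(int), np.asarray(j).astype(int)  # j may come back as float

# np.triu_indices walks the upper triangle in the same order as pdist
i_ref, j_ref = np.triu_indices(n, k=1)
assert np.array_equal(i, i_ref) and np.array_equal(j, j_ref)
assert np.allclose(squareform(d)[i, j], d)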
Answer 3 (score: 1)
Slightly faster. I did not test the time difference thoroughly, but running it a couple of times gave a time of about 0.0755529403687 for my method and 0.0328771495819 for yours. I use the triu method to get rid of the upper triangle of the array (where the duplicates are), including the diagonal (which is where the self-distances are), and I also do not sort, since if you plot it, it does not matter whether I plot the pairs in order or not. So I guess it speeds things up by about 15% or so.
import time

import numpy as np
from scipy.spatial.distance import cdist, pdist
from scipy.misc import comb

def close_pairs(X,max_d):
    d = cdist(X,X)
    I,J = (d<max_d).nonzero()
    IJ = np.sort(np.vstack((I,J)), axis=0)
    # remove diagonal elements
    IJ = IJ[:,np.diff(IJ,axis=0).ravel()!=0]
    # remove duplicates
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)
    return pairs

def close_pairs1(X,max_d):
    d = cdist(X,X)
    d1 = np.triu_indices(len(X))  # indices of the upper triangle including the diagonal
    d[d1] = max_d+1  # value that will not get selected when doing d<max_d in the next line
    I,J = (d<max_d).nonzero()
    pairs = np.vstack((I,J)).T
    return pairs

def close_pairs3(X, max_d):
    d = pdist(X)
    n = len(X)
    pairs = np.zeros((0,2))
    for i in range(n):
        for j in range(i+1,n):
            # formula from http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html
            a = d[int(comb(n,2)-comb(n-i,2)+j-i-1+0.1)]  # the +0.1 is because otherwise i get floating point trouble
            if a < max_d:
                pairs = np.r_[pairs, np.array([i,j])[None,:]]
    return pairs

def close_pairs4(X, max_d):
    d = pdist(X)
    n = len(X)
    a = np.where(d<max_d)[0]
    i = np.arange(n)[:,None]
    j = np.arange(n)[None,:]
    b = np.array(comb(n,2)-comb(n-i,2)+j-i-1+0.1, dtype=int)
    d1 = np.tril_indices(n)
    b[d1] = -1
    pairs = np.zeros((0,2), dtype=int)
    # next part is the bottleneck: the np.where each time
    for v in a:
        i, j = np.where(v==b)
        pairs = np.r_[pairs, np.array([i[0],j[0]])[None,:]]
    return pairs

def close_pairs5(X, max_d):
    t0 = time.time()
    d = pdist(X)
    n = len(X)
    a = np.where(d<max_d)[0]
    i = np.arange(n)[:,None]
    j = np.arange(n)[None,:]
    t1 = time.time()
    b = np.array(comb(n,2)-comb(n-i,2)+j-i-1+0.1, dtype=int)
    d1 = np.tril_indices(n)
    b[d1] = -1
    t2 = time.time()
    V = b[:,:,None]-a[None,None,:]  # takes a little time
    t3 = time.time()
    p = np.where(V==0)  # takes most of the time; thought that removing the for-loop from the previous method might improve it, but it does not do that much. This method contains the formula you wanted though, but apparently it is still faster if you use the cdist methods
    t4 = time.time()
    pairs = np.vstack((p[0],p[1])).T
    print t4-t3, t3-t2, t2-t1, t1-t0
    return pairs

def test():
    X = np.random.rand(1000,2)*20
    t0 = time.time()
    p = close_pairs(X,2)
    t1 = time.time()
    p2 = close_pairs1(X,2)
    t2 = time.time()
    print t2-t1, t1-t0

    from matplotlib import pyplot as plt
    plt.figure()
    plt.clf()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p,0].T,X[p,1].T,'-b')

    plt.figure()
    plt.clf()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p2,0].T,X[p2,1].T,'-b')
    plt.show()

test()
Note: plotting lags if you do it with 1K points, but you need 1K points to compare the speeds; I checked that it still plots correctly when done with 100 points. The speed difference is about 10% to 20%, and I do not think it will get much better than that, since I got rid of all the sorting and unique-element work, so the part that takes most of the time is probably the d = cdist(X,X) line.

EDIT: more testing shows that the cdist line takes about 0.065 seconds of that, while the rest of your method takes about 0.02 and the rest of mine about 0.015 seconds. Conclusion: the main bottleneck of the code is the d = cdist(X, X) line; the things I changed speed up the rest of the code, but the main bottleneck remains.

EDIT: added the method close_pairs3, which gives you the formula but is slow (still need to figure out how to invert that formula, and then it will be super fast; will do that tomorrow, using np.where(pdist(X) < max_d)).

EDIT: added the method close_pairs4, slightly better than 3 and explaining what happens, but very slow, and the same for method 5, which drops that for-loop but is still really slow.
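As a side note (an extra sketch, not one of the methods above): since the cdist line dominates, one small gain is to ask cdist for squared Euclidean distances and compare against max_d**2, skipping the sqrt entirely:

import numpy as np
from scipy.spatial.distance import cdist

def close_pairs_sq(X, max_d):
    # illustrative helper, not one of the answer's methods
    d2 = cdist(X, X, 'sqeuclidean')         # squared distances, no sqrt
    d2[np.triu_indices(len(X))] = np.inf    # mask diagonal + upper triangle, as in close_pairs1
    I, J = (d2 < max_d ** 2).nonzero()
    return np.vstack((I, J)).T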
Answer 4 (score: 0)
I wrote some code to compare the proposed solutions.

Note: I am using scipy 0.11 and cannot use the ckdtree solution (only kdtree), which I expect to be slower. Could anyone with scipy v0.12+ run this code?
import numpy as np
from scipy.spatial.distance import cdist, pdist
from scipy.spatial import ckdtree
from scipy.spatial import kdtree

def close_pairs(X,max_d):
    d = cdist(X,X)
    I,J = (d<max_d).nonzero()
    IJ = np.sort(np.vstack((I,J)), axis=0)
    # remove diagonal elements
    IJ = IJ[:,np.diff(IJ,axis=0).ravel()!=0]
    # remove duplicates
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)
    return pairs

def condensed_to_pair_indices(n,k):
    x = n-(4.*n**2-4*n-8*k+1)**.5/2-.5
    i = x.astype(int)
    j = k+i*(i+3-2*n)/2+1
    return i,j

def close_pairs_pdist(X,max_d):
    d = pdist(X)
    k = (d<max_d).nonzero()[0]
    return condensed_to_pair_indices(X.shape[0],k)

def close_pairs_triu(X,max_d):
    d = cdist(X,X)
    d1 = np.triu_indices(len(X))  # indices of the upper triangle including the diagonal
    d[d1] = max_d+1  # value that will not get selected when doing d<max_d in the next line
    I,J = (d<max_d).nonzero()
    pairs = np.vstack((I,J)).T
    return pairs

def close_pairs_ckdtree(X, max_d):
    tree = ckdtree.cKDTree(X)
    pairs = tree.query_pairs(max_d)
    return pairs  # remove the conversion as it is not required

def close_pairs_kdtree(X, max_d):
    tree = kdtree.KDTree(X)
    pairs = tree.query_pairs(max_d)
    return pairs  # remove the conversion as it is not required

methods = [close_pairs, close_pairs_pdist, close_pairs_triu, close_pairs_kdtree]  #, close_pairs_ckdtree]

def time_test(n=[10,50,100], max_d=[5,10,50], iter_num=100):
    import timeit
    for method in methods:
        print '-- time using ' + method.__name__ + ' ---'
        for ni in n:
            for d in max_d:
                setup = '\n'.join(['import numpy as np','import %s' % __name__,'np.random.seed(0)','X = np.random.rand(%d,2)*100'%ni])
                stmt = 'close_pairs.%s(X,%f)' % (method.__name__,d)
                time = timeit.timeit(stmt=stmt, setup=setup, number=iter_num)/iter_num
                print 'n=%3d, max_d=%2d: \t%.2fms' % (ni, d,time*1000)

time_test(iter_num=10,n=[20,100,500],max_d=[1,5,10])
The output is:
-- time using close_pairs ---
n= 20, max_d= 1: 0.22ms
n= 20, max_d= 5: 0.16ms
n= 20, max_d=10: 0.21ms
n=100, max_d= 1: 0.41ms
n=100, max_d= 5: 0.53ms
n=100, max_d=10: 0.97ms
n=500, max_d= 1: 7.12ms
n=500, max_d= 5: 12.28ms
n=500, max_d=10: 33.41ms
-- time using close_pairs_pdist ---
n= 20, max_d= 1: 0.11ms
n= 20, max_d= 5: 0.10ms
n= 20, max_d=10: 0.11ms
n=100, max_d= 1: 0.19ms
n=100, max_d= 5: 0.19ms
n=100, max_d=10: 0.19ms
n=500, max_d= 1: 2.31ms
n=500, max_d= 5: 2.82ms
n=500, max_d=10: 2.49ms
-- time using close_pairs_triu ---
n= 20, max_d= 1: 0.17ms
n= 20, max_d= 5: 0.16ms
n= 20, max_d=10: 0.16ms
n=100, max_d= 1: 0.83ms
n=100, max_d= 5: 0.80ms
n=100, max_d=10: 0.80ms
n=500, max_d= 1: 23.64ms
n=500, max_d= 5: 22.87ms
n=500, max_d=10: 22.96ms
-- time using close_pairs_kdtree ---
n= 20, max_d= 1: 1.71ms
n= 20, max_d= 5: 1.69ms
n= 20, max_d=10: 1.96ms
n=100, max_d= 1: 34.99ms
n=100, max_d= 5: 35.47ms
n=100, max_d=10: 34.91ms
n=500, max_d= 1: 253.87ms
n=500, max_d= 5: 255.05ms
n=500, max_d=10: 256.66ms
Conclusions (from the timings above): the pdist-based solution is clearly the fastest at these sizes, the original cdist approach and the triu variant are broadly comparable, and the pure-Python kdtree is far slower than all of them. However, the ckdtree method still needs to be tested.
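For completeness, anyone on scipy 0.12+ could re-enable the cKDTree variant and rerun the same harness (a sketch; note that time_test's timeit statement assumes the script is saved as close_pairs.py):

# with scipy >= 0.12, include the cKDTree variant and rerun the benchmark
methods = [close_pairs, close_pairs_pdist, close_pairs_triu,
           close_pairs_kdtree, close_pairs_ckdtree]
time_test(iter_num=10, n=[20,100,500], max_d=[1,5,10])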