import numpy as np
from scipy.spatial.distance import cdist

def close_pairs(X,max_d):
    """Return the index pairs of all points that are closer than ``max_d``.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold; a pair is reported when its Euclidean distance
        is strictly smaller than ``max_d``.

    Returns
    -------
    numpy.ndarray, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by ``i``
        and then by ``j``.  The shape is ``(0, 2)`` when no pair is close.
    """
    d = cdist(X, X)

    I, J = (d < max_d).nonzero()

    # cdist(X, X) is symmetric, so (i, j) and (j, i) are always selected
    # together.  Keeping only i < j therefore removes the diagonal and the
    # mirrored duplicates in one step.  nonzero() scans the matrix row by
    # row, so the surviving pairs are already sorted.
    keep = I < J
    return np.column_stack((I[keep], J[keep]))

def test():
    """Run ``close_pairs`` on 100 random 2-D points spread over a 20 x 20 box.

    NOTE(review): pyplot is imported but nothing is drawn; the plotting code
    appears to be missing from this copy of the snippet.
    """
    points = np.random.rand(100, 2) * 20
    p = close_pairs(points, 2)

    import matplotlib.pyplot as plt






  • 一个函数 f(k) -> (i, j)
  • 使得 cdist(X,X)[i,j] == pdist(X)[k]

5 个答案:

根据我的经验,在 3D 中查找邻居列表有两种最快的方法。一种是用 C++ 或 Cython 编写最朴素的双重循环(我的例子中两者都用到了)。它的时间复杂度是 O(N^2),但对于小型系统来说非常快。另一种是使用线性时间算法。Scipy 的 ckdtree 是一个不错的选择,但有局限性。分子动力学软件中的邻居列表查找器功能最强大,但很难封装,并且初始化可能很慢。


  • 天真的cython代码
  • OpenMM周围的包装(非常难以安装,见下文)
  • Scipy.spatial.ckdtree
  • scipy.spatial.distance.pdist

测试设置:n 个点散布在一个矩形盒子中,体积密度为 0.2。系统大小从 10 到 1000000(一百万)个粒子。接触半径取 0.5、1、2、4、7、10。注意,因为密度是 0.2,在接触半径 0.5 时,每个粒子平均约有 0.1 个接触;半径为 1 时约 0.8 个,半径为 2 时约 6.4 个,半径为 10 时约 800 个!对于小系统,接触查找会重复运行多次;对于超过 30k 个粒子的系统只运行一次。如果每次调用的时间超过 5 秒,则该次运行被中止。

测试环境:双路 Xeon 2687Wv3,128GB RAM,Ubuntu 14.04,python 2.7.11,scipy 0.16.0,numpy 1.10.1。所有代码都没有使用并行优化(OpenMM 除外,不过它的并行部分运行得非常快,在 CPU 占用图上几乎看不出来,大部分时间都花在与 OpenMM 之间传输数据上)。

结果:请注意,下面的图使用对数刻度,跨越 6 个数量级以上,即使很小的视觉差异实际上也可能是 10 倍。对于小于 1000 个粒子的系统,Cython 代码总是更快;但超过 1000 个粒子之后,结果取决于接触半径。pdist 实现总是比 cython 慢,并且需要更多的内存,因为它显式地创建了距离矩阵;由于要计算 sqrt,它也很慢。

  • 在小接触半径(每个粒子约 1 个接触)时,ckdtree 对所有系统大小都是理想选择。
  • 在中等接触半径(每个粒子 5-50 个接触)时,朴素的 cython 实现在 10000 个粒子以内是最好的;之后 OpenMM 开始以大约几个数量级的优势胜出,而 ckdtree 只慢 3-10 倍。
  • 在大接触半径(每个粒子约 200 个接触)时,朴素方法在多达 100k 或 1M 个粒子的范围内都适用,之后 OpenMM 可能胜出。

安装 OpenMM 非常棘手;您可以在 http://bitbucket.org/mirnylab/openmm-polymer 的文件 "contactmaps.py" 或自述文件中阅读更多内容。然而,下面的结果表明,只有在 N > 100k 个粒子且每个粒子有 5-50 个接触时,它才有优势。

enter image description here


import numpy as np
cimport numpy as np
cimport cython

# Hand-written declaration of the C++ std::vector template from <vector>.
# Only the members listed here are visible to the Cython code in this file.
cdef extern from "<vector>" namespace "std":
    cdef cppclass vector[T]:
        # Nested iterator type: dereference, pre-increment and comparisons.
        cppclass iterator:
            T operator*()
            iterator operator++()
            bint operator==(iterator)
            bint operator!=(iterator)
        # Append an element at the end.
        void push_back(T&)
        # Element access by index (operator[] and at()).
        T& operator[](int)
        T& at(int)
        # Iterators to the first element and one past the last element.
        iterator begin()
        iterator end()

np.import_array() # initialize C API to call PyArray_SimpleNewFromData
# Wrap an existing C ``int`` buffer of ``size`` elements in a 1-D NumPy array
# of type NPY_INT.  Raises ValueError when ``data`` is NULL or ``size`` is
# negative.  Declared ``public api`` and ``with gil``.
cdef public api tonumpyarray(int* data, long long size) with gil:
    if not (data and size >= 0): raise ValueError
    # The array has a single dimension whose length is ``size``.
    cdef np.npy_intp dims = size
    #NOTE: it doesn't take ownership of `data`. You must free `data` yourself
    return np.PyArray_SimpleNewFromData(1, &dims, np.NPY_INT, <void*>data)

def contactsCython(inArray, cutoff):
    """Return all pairs of 3-D points that are closer than ``cutoff``.

    Every pair ``i < j`` is tested with a plain double loop, so the running
    time grows quadratically with the number of points.

    Parameters
    ----------
    inArray : array_like, shape (N, >=3)
        Point coordinates; it is converted to a C-ordered float64 array and
        only columns 0, 1 and 2 are read.
    cutoff : float
        Contact distance; a pair is reported when its distance is strictly
        smaller than ``cutoff``.

    Returns
    -------
    numpy.ndarray of int32, shape (M, 2)
        One row ``(i, j)`` with ``i < j`` for every contact found.
    """
    inArray = np.asarray(inArray, dtype = np.float64, order = "C")
    cdef int N = len(inArray)
    cdef np.ndarray[np.double_t, ndim = 2] data = inArray
    cdef int j,i
    cdef double curdist
    cdef double cutoff2 = cutoff * cutoff  # IMPORTANT to avoid slow sqrt calculation
    # The two vectors grow in lockstep: contacts1[m] and contacts2[m] hold the
    # first and second index of the m-th contact.
    cdef vector[int] contacts1
    cdef vector[int] contacts2
    for i in range(N):
        for j in range(i+1, N):
            # Squared distance, compared against the squared cutoff.
            curdist = (data[i,0] - data[j,0]) **2 +(data[i,1] - data[j,1]) **2 + (data[i,2] - data[j,2]) **2
            if curdist < cutoff2:
                contacts1.push_back(i)
                contacts2.push_back(j)
    cdef int M = len(contacts1)

    # Copy the two index vectors into the columns of the result array.
    cdef np.ndarray[np.int32_t, ndim = 2] contacts = np.zeros((M,2), dtype = np.int32)
    for i in range(M):
        contacts[i,0] = contacts1[i]
        contacts[i,1] = contacts2[i]
    return contacts


    cython --cplus fastContacts.pyx
    g++  -g -march=native -Ofast -fpic -c   fastContacts.cpp -o fastContacts.o `python-config --includes`
    g++  -g -march=native -Ofast -shared  -o fastContacts.so  fastContacts.o `python-config --libs`


from __future__ import print_function, division

import signal
import time
from contextlib import contextmanager

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import ckdtree
from scipy.spatial.distance import pdist

from contactmaps import giveContactsOpenMM  # remove this unless you have OpenMM and openmm-polymer libraries installed
from fastContacts import contactsCython

class TimeoutException(Exception):
    """Raised inside a ``time_limit`` block when the time budget runs out."""


@contextmanager
def time_limit(seconds):
    """Abort the enclosed ``with`` body after ``seconds`` of wall-clock time.

    The limit is enforced with ``SIGALRM``, so it only works on Unix and only
    when used from the main thread.

    Parameters
    ----------
    seconds : int or float
        Time budget for the body; ``0`` disables the limit.

    Raises
    ------
    TimeoutException
        Raised in the body once the budget is exhausted.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    # setitimer accepts fractional seconds, unlike signal.alarm.
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Always cancel the pending timer and put the old handler back so a
        # late alarm cannot fire in unrelated code.
        signal.setitimer(signal.ITIMER_REAL, 0)
        if previous_handler is None:
            # The old handler was not installed from Python and cannot be
            # restored through signal.signal(); fall back to the default.
            previous_handler = signal.SIG_DFL
        signal.signal(signal.SIGALRM, previous_handler)

# Set matplotlib's global default font size to 8 (presumably so that the
# titles and labels of the 2 x 3 subplot grid drawn below stay legible).
matplotlib.rcParams.update({'font.size': 8})

def close_pairs_ckdtree(X, max_d):
    """Return the index pairs of all points within ``max_d``, using a k-d tree.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Maximum distance; ``query_pairs`` reports pairs whose distance is at
        most ``max_d``.

    Returns
    -------
    numpy.ndarray of int, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per pair, in no particular order.
        The shape is ``(0, 2)`` when no pair is found.
    """
    # Imported from the public namespace: the ``scipy.spatial.ckdtree``
    # submodule is deprecated in recent SciPy releases.
    from scipy.spatial import cKDTree

    pairs = cKDTree(X).query_pairs(max_d)
    # The reshape keeps the two-column layout even for an empty set, so
    # callers can always index the columns.
    return np.array(list(pairs), dtype=int).reshape(-1, 2)

def condensed_to_pair_indices(n, k):
    """Convert condensed ``pdist`` indices into ``(i, j)`` index pairs.

    For ``n`` points, ``pdist`` stores the distance of the pair ``i < j`` at
    position ``k = n*i - i*(i+1)/2 + (j - i - 1)``.  This function inverts
    that mapping.

    Parameters
    ----------
    n : int
        Number of points the condensed vector was built from.
    k : int or array_like of int
        Condensed indices, each in ``range(n*(n-1)//2)``.

    Returns
    -------
    numpy.ndarray of int
        Shape ``(len(k), 2)`` with one ``(i, j)`` row per index; shape
        ``(2,)`` when ``k`` is a scalar.
    """
    k = np.asarray(k)
    # Solving the quadratic k = n*i - i*(i+1)/2 for i and truncating gives
    # the row that contains position k.
    x = n - (4. * n ** 2 - 4 * n - 8 * k + 1) ** .5 / 2 - .5
    i = x.astype(int)
    # i * (i + 3 - 2*n) is always even, so floor division is exact and keeps
    # the column indices integral.
    j = k + i * (i + 3 - 2 * n) // 2 + 1
    return np.array([i, j]).T

def close_pairs_pdist(X, max_d):
    """Return the pairs of rows of ``X`` that are closer than ``max_d``.

    The condensed distance vector from ``pdist`` is thresholded with a strict
    ``<`` comparison, and the positions that pass are handed to
    ``condensed_to_pair_indices`` together with the number of points.
    """
    condensed = pdist(X)
    hits = np.flatnonzero(condensed < max_d)
    n_points = X.shape[0]
    return condensed_to_pair_indices(n_points, hits)

# Small random test set: 100 points in a 3 x 3 x 3 box.
a = np.random.random((100, 3)) * 3
methods = {
    "cython": contactsCython,
    "ckdtree": close_pairs_ckdtree,
    "OpenMM": giveContactsOpenMM,
    "pdist": close_pairs_pdist,
}

# Consistency check: every method must report the same contacts for radius 1.
allUniqueInds = []
for method in methods.values():
    found = method(a, 1)
    # Fold each (i, j) pair into one number; with 100 points, i + 100 * j is
    # unique per pair.  Sorting makes the comparison independent of order.
    codes = found[:, 0] + 100 * found[:, 1]
    allUniqueInds.append(np.sort(codes))
reference = allUniqueInds[0]
for codes in allUniqueInds:
    assert np.allclose(codes, reference)

# Benchmark proper: time every method for several system sizes and radii.
# repeats[m] is the number of independent random systems of sizes[m]
# particles that are generated and timed.
repeats = [30,30,30, 30, 30, 20,  20,   10,   5,   3,     2 ,       1,     1,      1]
sizes =    [10,30,100, 200, 300,  500, 1000, 2000, 3000, 10000, 30000, 100000, 300000, 1000000]
# The box edge (n / 0.2) ** (1/3) keeps the particle density at 0.2 for
# every system size.
systems = [[np.random.random((n, 3)) * ((n / 0.2) ** 0.333333) for k in range(repeat)] for n, repeat in
           zip(sizes, repeats)]

for j, radius in enumerate([0.5, 1, 2, 4, 7, 10]):
    # One subplot per contact radius, laid out on a 2 x 3 grid.
    plt.subplot(2, 3, j + 1)
    plt.title("Radius = {0}; {1:.2f} cont per particle".format(radius, 0.2 * (4 / 3 * np.pi * radius ** 3)))

    times = {i: [] for i in methods}

    for name, method in methods.items():
        for n, system, repeat in zip(sizes, systems, repeats):
            if name == "pdist" and n > 30000:
                break  # memory issues
            st = time.time()
            try:
                # Budget of 5 seconds per call, summed over all repeats.
                with time_limit(5 * repeat):
                    for ind in range(repeat):
                        k = len(method(system[ind], radius))
            except TimeoutException:
                print("Run aborted")
                # No valid timing exists for this size, and larger systems
                # are expected to be slower still, so stop with this method.
                break
            end = time.time()
            mytime = (end - st) / repeat
            times[name].append((n, mytime))
            print("{0} radius={1} n={2} time={3} repeat={4} contPerParticle={5}".format(name, radius, n, mytime,repeat, 2 * k / n))

    for name in sorted(times.keys()):
        plt.plot(*zip(*times[name]), label=name)
    # Sizes and timings span several orders of magnitude: use log-log axes.
    plt.loglog()
    plt.xlabel("System size")
    plt.ylabel("Time (seconds)")
    # The curves are labelled by method name; show those labels.
    plt.legend(loc=0)

plt.show()


import numpy as np
from scipy.spatial.distance import cdist
from scipy.spatial import ckdtree

def close_pairs(X,max_d):
    """Return the index pairs of all points that are closer than ``max_d``.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold; a pair is reported when its Euclidean distance
        is strictly smaller than ``max_d``.

    Returns
    -------
    numpy.ndarray, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by ``i``
        and then by ``j``.  The shape is ``(0, 2)`` when no pair is close.
    """
    d = cdist(X, X)

    I, J = (d < max_d).nonzero()

    # cdist(X, X) is symmetric, so (i, j) and (j, i) are always selected
    # together.  Keeping only i < j therefore removes the diagonal and the
    # mirrored duplicates in one step.  nonzero() scans the matrix row by
    # row, so the surviving pairs are already sorted.
    keep = I < J
    return np.column_stack((I[keep], J[keep]))

def close_pairs_ckdtree(X, max_d):
    """Return the index pairs of all points within ``max_d``, using a k-d tree.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Maximum distance; ``query_pairs`` reports pairs whose distance is at
        most ``max_d``.

    Returns
    -------
    numpy.ndarray of int, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per pair, in no particular order.
        The shape is ``(0, 2)`` when no pair is found.
    """
    # Imported from the public namespace: the ``scipy.spatial.ckdtree``
    # submodule is deprecated in recent SciPy releases.
    from scipy.spatial import cKDTree

    pairs = cKDTree(X).query_pairs(max_d)
    # The reshape keeps the two-column layout even for an empty set, so
    # callers can always index the columns.
    return np.array(list(pairs), dtype=int).reshape(-1, 2)

def test():
    """Run both pair finders on 100 random 2-D points in a 20 x 20 box.

    NOTE(review): pyplot is imported but nothing is drawn; the plotting code
    appears to be missing from this copy of the snippet.
    """
    points = np.random.rand(100, 2) * 20
    p = close_pairs(points, 2)
    q = close_pairs_ckdtree(points, 2)

    import matplotlib.pyplot as plt



def condensed_to_pair_indices(n,k):
    """Convert condensed ``pdist`` indices into row and column indices.

    For ``n`` points, ``pdist`` stores the distance of the pair ``i < j`` at
    position ``k = n*i - i*(i+1)/2 + (j - i - 1)``.  This function inverts
    that mapping.

    Parameters
    ----------
    n : int
        Number of points the condensed vector was built from.
    k : int or array_like of int
        Condensed indices, each in ``range(n*(n-1)//2)``.

    Returns
    -------
    (i, j) : tuple
        Integer row and column indices with the same shape as ``k``.
    """
    k = np.asarray(k)
    # Solving the quadratic k = n*i - i*(i+1)/2 for i and truncating gives
    # the row that contains position k.
    x = n - (4. * n ** 2 - 4 * n - 8 * k + 1) ** .5 / 2 - .5
    i = x.astype(int)
    # i * (i + 3 - 2*n) is always even, so floor division is exact and keeps
    # the column indices integral.
    j = k + i * (i + 3 - 2 * n) // 2 + 1
    return i, j


def close_pairs_pdist(X,max_d):
    """Return the indices of the pairs of rows of ``X`` closer than ``max_d``.

    The condensed distance vector from ``pdist`` is thresholded with a strict
    ``<`` comparison, and the positions that pass are handed to
    ``condensed_to_pair_indices`` together with the number of points.
    """
    condensed = pdist(X)
    hits = np.flatnonzero(condensed < max_d)
    n_points = X.shape[0]
    return condensed_to_pair_indices(n_points, hits)


import numpy as np
from scipy.spatial.distance import cdist
from scipy.misc import comb

def close_pairs(X,max_d):
    """Return the index pairs of all points that are closer than ``max_d``.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold; a pair is reported when its Euclidean distance
        is strictly smaller than ``max_d``.

    Returns
    -------
    numpy.ndarray, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by ``i``
        and then by ``j``.  The shape is ``(0, 2)`` when no pair is close.
    """
    d = cdist(X, X)
    I, J = (d < max_d).nonzero()

    # cdist(X, X) is symmetric, so (i, j) and (j, i) are always selected
    # together.  Keeping only i < j therefore removes the diagonal and the
    # mirrored duplicates in one step.  nonzero() scans the matrix row by
    # row, so the surviving pairs are already sorted.
    keep = I < J
    return np.column_stack((I[keep], J[keep]))

def close_pairs1(X,max_d):
    """Return the pairs ``(i, j)`` with ``i > j`` that are closer than ``max_d``.

    Only the part of the full distance matrix strictly below the diagonal is
    considered, so each pair appears once and no point is paired with
    itself.  The result has shape ``(m, 2)`` and is ordered by ``i`` and
    then by ``j``.
    """
    dist = cdist(X, X)
    # k=-1 blanks the diagonal and everything above it.
    below_diag = np.tril(dist < max_d, k=-1)
    rows, cols = below_diag.nonzero()
    return np.vstack((rows, cols)).T

def close_pairs3(X, max_d):
    """Return the close pairs by looking each pair up in the ``pdist`` vector.

    This is the explicit (and slow) double-loop version.  It shows the
    forward formula that maps a pair ``i < j`` to its position in the
    condensed distance vector.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold (strict ``<`` comparison).

    Returns
    -------
    numpy.ndarray of int, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by ``i``
        and then by ``j``.
    """
    d = pdist(X)
    n = len(X)
    found = []
    for i in range(n):
        # Start of row i in the condensed vector.  This equals
        # comb(n, 2) - comb(n - i, 2) from the squareform documentation, but
        # in pure integer arithmetic, so no rounding fudge is needed.
        row_start = n * i - i * (i + 1) // 2
        for j in range(i + 1, n):
            if d[row_start + (j - i - 1)] < max_d:
                found.append((i, j))
    # Build the array once at the end; the reshape keeps the two-column
    # layout when nothing was found.
    return np.array(found, dtype=int).reshape(-1, 2)

def close_pairs4(X, max_d):
    """Return the close pairs by searching a table of condensed indices.

    An ``n x n`` table holds, for every ``i < j``, the position of that pair
    in the ``pdist`` vector.  Each position that passes the threshold is then
    located in the table.  The per-hit search makes this slow; it is kept as
    an illustration of the index mapping.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold (strict ``<`` comparison).

    Returns
    -------
    numpy.ndarray of int, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by
        condensed index.
    """
    d = pdist(X)
    n = len(X)
    a = np.where(d < max_d)[0]
    i = np.arange(n)[:, None]
    j = np.arange(n)[None, :]
    # Condensed position of every pair, in exact integer arithmetic
    # (equivalent to comb(n, 2) - comb(n - i, 2) + j - i - 1).
    b = n * i - i * (i + 1) // 2 + (j - i - 1)
    # Entries on and below the diagonal are not valid pairs; -1 can never
    # match a real condensed index.
    b[np.tril_indices(n)] = -1

    # Preallocate instead of growing the result row by row.
    pairs = np.zeros((len(a), 2), dtype=int)

    # next part is the bottleneck: the np.where each time
    for row, v in enumerate(a):
        r, c = np.where(b == v)
        pairs[row] = r[0], c[0]
    return pairs

def close_pairs5(X, max_d):
    """Return the close pairs via a loop-free search of the index table.

    Same idea as the per-hit search over a table of condensed indices, but
    all hits are located in one vectorised membership test.  Four stage
    timings are printed as a side effect.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold (strict ``<`` comparison).

    Returns
    -------
    numpy.ndarray of int, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by ``i``
        and then by ``j``.
    """
    import time

    t0 = time.time()
    d = pdist(X)
    n = len(X)
    a = np.where(d < max_d)[0]
    i = np.arange(n)[:, None]
    j = np.arange(n)[None, :]
    t1 = time.time()
    # Condensed position of every pair, in exact integer arithmetic
    # (equivalent to comb(n, 2) - comb(n - i, 2) + j - i - 1).
    b = n * i - i * (i + 1) // 2 + (j - i - 1)
    # Entries on and below the diagonal are not valid pairs; -1 can never
    # match a real condensed index.
    b[np.tril_indices(n)] = -1
    t2 = time.time()
    # A membership test needs only an n x n mask.  Subtracting every hit
    # from the table would allocate n * n * len(a) values.
    hit = np.isin(b, a)
    t3 = time.time()
    pairs = np.argwhere(hit)
    t4 = time.time()
    # Formatting first keeps the output the same under Python 2 and 3.
    print("%s %s %s %s" % (t4 - t3, t3 - t2, t2 - t1, t1 - t0))
    return pairs

def test():
    """Time ``close_pairs`` against ``close_pairs1`` on 1000 random points.

    Prints the run time of ``close_pairs1`` followed by the run time of
    ``close_pairs``, both in seconds.

    NOTE(review): pyplot is imported but nothing is drawn; the plotting code
    appears to be missing from this copy of the snippet.
    """
    X = np.random.rand(1000,2)*20
    import time
    t0 = time.time()
    p = close_pairs(X,2)
    t1 = time.time()
    p2 = close_pairs1(X,2)
    t2 = time.time()
    # Formatting first keeps the output the same under Python 2 and 3.
    print("%s %s" % (t2 - t1, t1 - t0))

    from matplotlib import pyplot as plt


注意:使用 1K 个点时绘图会很卡,但比较速度需要 1K 个点;我用 100 个点检查过,绘图结果是正确的。速度差异大约是 10% 到 20%,我认为不会比这更好了,因为我已经去掉了所有排序和去重的步骤,所以占用大部分时间的可能是 d = cdist(X, X) 这一行

编辑:更多的测试表明,在上述耗时中,cdist 这一行大约占 0.065 秒,而你的方法的其余部分约为 0.02 秒,我的方法的其余部分约为 0.015 秒。结论:代码的主要瓶颈是 d = cdist(X, X) 这一行;我的改动加快了其余的代码,但主要瓶颈仍然存在

编辑:添加方法 close_pairs3,它给出了你要的公式,但速度很慢(仍然需要弄清楚如何反转该公式,那样就会非常快;我明天再做,届时将使用 np.where(pdist(X) < max_d))

编辑:添加方法 close_pairs4,比方法 3 稍好,并且说明了其中的原理,但速度很慢;方法 5 也一样,它去掉了那个 for 循环,但仍然非常慢

注意:我使用的是 scipy 0.11,无法使用 ckdtree 方案(只能用 kdtree),我预计 kdtree 会更慢。使用 scipy v0.12+ 的人可以运行一下这段代码吗?

import numpy as np
from scipy.spatial.distance import cdist, pdist
from scipy.spatial import ckdtree
from scipy.spatial import kdtree

def close_pairs(X,max_d):
    """Return the index pairs of all points that are closer than ``max_d``.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Distance threshold; a pair is reported when its Euclidean distance
        is strictly smaller than ``max_d``.

    Returns
    -------
    numpy.ndarray, shape (m, 2)
        One row ``(i, j)`` with ``i < j`` per close pair, ordered by ``i``
        and then by ``j``.  The shape is ``(0, 2)`` when no pair is close.
    """
    d = cdist(X, X)

    I, J = (d < max_d).nonzero()

    # cdist(X, X) is symmetric, so (i, j) and (j, i) are always selected
    # together.  Keeping only i < j therefore removes the diagonal and the
    # mirrored duplicates in one step.  nonzero() scans the matrix row by
    # row, so the surviving pairs are already sorted.
    keep = I < J
    return np.column_stack((I[keep], J[keep]))

def condensed_to_pair_indices(n,k):
    """Convert condensed ``pdist`` indices into row and column indices.

    For ``n`` points, ``pdist`` stores the distance of the pair ``i < j`` at
    position ``k = n*i - i*(i+1)/2 + (j - i - 1)``.  This function inverts
    that mapping.

    Parameters
    ----------
    n : int
        Number of points the condensed vector was built from.
    k : int or array_like of int
        Condensed indices, each in ``range(n*(n-1)//2)``.

    Returns
    -------
    (i, j) : tuple
        Integer row and column indices with the same shape as ``k``.
    """
    k = np.asarray(k)
    # Solving the quadratic k = n*i - i*(i+1)/2 for i and truncating gives
    # the row that contains position k.
    x = n - (4. * n ** 2 - 4 * n - 8 * k + 1) ** .5 / 2 - .5
    i = x.astype(int)
    # i * (i + 3 - 2*n) is always even, so floor division is exact and keeps
    # the column indices integral.
    j = k + i * (i + 3 - 2 * n) // 2 + 1
    return i, j

def close_pairs_pdist(X,max_d):
    """Return the indices of the pairs of rows of ``X`` closer than ``max_d``.

    The condensed distance vector from ``pdist`` is thresholded with a strict
    ``<`` comparison, and the positions that pass are handed to
    ``condensed_to_pair_indices`` together with the number of points.
    """
    condensed = pdist(X)
    hits = np.flatnonzero(condensed < max_d)
    n_points = X.shape[0]
    return condensed_to_pair_indices(n_points, hits)

def close_pairs_triu(X,max_d):
    """Return the pairs ``(i, j)`` with ``i > j`` that are closer than ``max_d``.

    Only the part of the full distance matrix strictly below the diagonal is
    considered, so each pair appears once and no point is paired with
    itself.  The result has shape ``(m, 2)`` and is ordered by ``i`` and
    then by ``j``.
    """
    dist = cdist(X, X)
    # k=-1 blanks the diagonal and everything above it.
    below_diag = np.tril(dist < max_d, k=-1)
    rows, cols = below_diag.nonzero()
    return np.vstack((rows, cols)).T

def close_pairs_ckdtree(X, max_d):
    """Return the set of index pairs within ``max_d``, using ``cKDTree``.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Maximum distance; ``query_pairs`` reports pairs whose distance is at
        most ``max_d``.

    Returns
    -------
    set of tuple
        Pairs ``(i, j)`` with ``i < j``, exactly as returned by
        ``query_pairs`` (no conversion to an array).
    """
    # Imported from the public namespace: the ``scipy.spatial.ckdtree``
    # submodule is deprecated in recent SciPy releases.
    from scipy.spatial import cKDTree

    return cKDTree(X).query_pairs(max_d)

def close_pairs_kdtree(X, max_d):
    """Return the set of index pairs within ``max_d``, using ``KDTree``.

    Parameters
    ----------
    X : array_like, shape (n, dims)
        Point coordinates, one point per row.
    max_d : float
        Maximum distance; ``query_pairs`` reports pairs whose distance is at
        most ``max_d``.

    Returns
    -------
    set of tuple
        Pairs ``(i, j)`` with ``i < j``, exactly as returned by
        ``query_pairs`` (no conversion to an array).
    """
    # Imported from the public namespace: the ``scipy.spatial.kdtree``
    # submodule is deprecated in recent SciPy releases.
    from scipy.spatial import KDTree

    return KDTree(X).query_pairs(max_d)

# Implementations timed by time_test(), in the order they are reported.
# close_pairs_ckdtree is deliberately left out (commented out at the end of
# the list).
methods = [close_pairs, close_pairs_pdist, close_pairs_triu, close_pairs_kdtree] #, close_pairs_ckdtree]

def time_test(n=(10, 50, 100), max_d=(5, 10, 50), iter_num=100):
    """Print the average run time of every function in ``methods``.

    Each method is timed for every combination of point count and distance
    threshold.  The points are random 2-D points in a 100 x 100 box.

    Parameters
    ----------
    n : iterable of int
        Point counts to test.
    max_d : iterable of float
        Distance thresholds to test.
    iter_num : int
        Number of calls averaged per measurement.
    """
    import timeit

    for method in methods:
        print('-- time using ' + method.__name__ + ' ---')
        for ni in n:
            # Same points as np.random.seed(0) followed by
            # np.random.rand(ni, 2), but without reseeding the global
            # generator.
            X = np.random.RandomState(0).rand(ni, 2) * 100
            for d in max_d:
                # Timing a callable avoids building an import statement from
                # the module name, which broke whenever this file was not
                # importable as "close_pairs".
                elapsed = timeit.timeit(lambda: method(X, d), number=iter_num) / iter_num
                print('n=%3d, max_d=%2d: \t%.2fms' % (ni, d, elapsed * 1000))


-- time using close_pairs ---
n= 20, max_d= 1:    0.22ms
n= 20, max_d= 5:    0.16ms
n= 20, max_d=10:    0.21ms
n=100, max_d= 1:    0.41ms
n=100, max_d= 5:    0.53ms
n=100, max_d=10:    0.97ms
n=500, max_d= 1:    7.12ms
n=500, max_d= 5:    12.28ms
n=500, max_d=10:    33.41ms
-- time using close_pairs_pdist ---
n= 20, max_d= 1:    0.11ms
n= 20, max_d= 5:    0.10ms
n= 20, max_d=10:    0.11ms
n=100, max_d= 1:    0.19ms
n=100, max_d= 5:    0.19ms
n=100, max_d=10:    0.19ms
n=500, max_d= 1:    2.31ms
n=500, max_d= 5:    2.82ms
n=500, max_d=10:    2.49ms
-- time using close_pairs_triu ---
n= 20, max_d= 1:    0.17ms
n= 20, max_d= 5:    0.16ms
n= 20, max_d=10:    0.16ms
n=100, max_d= 1:    0.83ms
n=100, max_d= 5:    0.80ms
n=100, max_d=10:    0.80ms
n=500, max_d= 1:    23.64ms
n=500, max_d= 5:    22.87ms
n=500, max_d=10:    22.96ms
-- time using close_pairs_kdtree ---
n= 20, max_d= 1:    1.71ms
n= 20, max_d= 5:    1.69ms
n= 20, max_d=10:    1.96ms
n=100, max_d= 1:    34.99ms
n=100, max_d= 5:    35.47ms
n=100, max_d=10:    34.91ms
n=500, max_d= 1:    253.87ms
n=500, max_d= 5:    255.05ms
n=500, max_d=10:    256.66ms

