Find all pairs of points closer than a given maximum distance

Date: 2014-01-22 14:03:23

Tags: python numpy scipy coordinates distance

I want to (efficiently) find all pairs of points closer than some distance max_d. My current approach, using cdist, is:

import numpy as np
from scipy.spatial.distance import cdist

def close_pairs(X,max_d):
    d = cdist(X,X)

    I,J = (d<max_d).nonzero()
    IJ  = np.sort(np.vstack((I,J)), axis=0)

    # remove diagonal element
    IJ  = IJ[:,np.diff(IJ,axis=0).ravel()!=0]

    # remove duplicate
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)

    return pairs

def test():
    X = np.random.rand(100,2)*20
    p = close_pairs(X,2)

    from matplotlib import pyplot as plt
    plt.clf()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p,0].T,X[p,1].T,'-b')

But I think this is overkill (and not very readable), since most of the work is spent just removing self-distances and duplicates.

My main question is: is there a better way to do this?

(Note: the output type (array, set, ...) is not important at this point.)

My current line of thinking is to use pdist, which returns a condensed distance array that contains each pair only once. However, once I have found the suitable indices k in the condensed distance array, how do I work out which pair i,j each of them is equivalent to?

So the alternative question is: is there an easy way to get the list of index pairs corresponding to the entries of the pdist output (a small sketch follows the list below):

  • a function f(k) -> i,j
  • such that cdist(X,X)[i,j] = pdist(X)[k]
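For what it's worth, here is a minimal sketch (not part of the original question) exploiting the fact that pdist stores the pairs (0,1), (0,2), ..., (1,2), ... in row-major upper-triangle order, so np.triu_indices already provides such an f(k):

import numpy as np
from scipy.spatial.distance import cdist, pdist

X = np.random.rand(50, 2)
n = X.shape[0]

# the k-th condensed entry corresponds to the k-th pair of triu_indices
i, j = np.triu_indices(n, k=1)
assert np.allclose(cdist(X, X)[i, j], pdist(X))

def f(k, n):
    # simple (though memory-hungry for large n) answer to f(k) -> (i, j)
    i, j = np.triu_indices(n, k=1)
    return i[k], j[k]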

5 Answers:

Answer 0 (score: 4):

In my experience, there are two fastest ways to find neighbor lists in 3D. One is to use the most naive double-for-loop code, written in C++ or Cython (both, in my case). It runs in N^2, but is very fast for small systems. The other way is to use a linear-time algorithm. SciPy's cKDTree is a decent choice, but has its limitations. Neighbor-list finders from molecular dynamics software are by far the most powerful, but they are very hard to wrap and their initialization can be slow.
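To make the "linear time" idea concrete, here is a minimal cell-list sketch (my illustration, not the author's code, and not one of the methods benchmarked below): points are binned into cubic cells of side max_d, so every pair closer than max_d must lie in the same or an adjacent cell.

import itertools
import numpy as np

def close_pairs_cells(X, max_d):
    # bin each point into an integer grid cell of side max_d
    cells = {}
    for idx, c in enumerate(np.floor(X / max_d).astype(int)):
        cells.setdefault(tuple(c), []).append(idx)

    # only the same cell and its neighbours need to be checked
    offsets = list(itertools.product((-1, 0, 1), repeat=X.shape[1]))
    pairs = []
    for c, members in cells.items():
        for off in offsets:
            other = cells.get(tuple(np.add(c, off)))
            if other is None:
                continue
            for i in members:
                for j in other:
                    if i < j and np.sum((X[i] - X[j]) ** 2) < max_d ** 2:
                        pairs.append((i, j))
    return np.array(pairs) if pairs else np.empty((0, 2), int)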

Below I compare four methods:

  • naive Cython code
  • a wrapper around OpenMM (very hard to install, see below)
  • Scipy.spatial.ckdtree
  • scipy.spatial.distance.pdist

Test setup: n points scattered in a rectangular box at volume density 0.2. System sizes range from 10 to 1000000 (a million) particles. Contact radii are taken from 0.5, 1, 2, 4, 7, 10. Note that since the density is 0.2, at a contact radius of 0.5 we have on average about 0.1 contacts per particle, at 1 about 0.8, at 2 about 6.4, and at 10 roughly 800 (contacts per particle ≈ density × (4/3)πr³). Contact finding was repeated several times for small systems and done once for systems of more than 30k particles. If the time per call exceeded 5 seconds, the run was aborted.

Setup: dual Xeon 2687Wv3, 128GB RAM, Ubuntu 14.04, Python 2.7.11, scipy 0.16.0, numpy 1.10.1. None of the codes used parallel optimizations (except OpenMM, though its parallel part is so fast that it is not even noticeable on a CPU plot; most of the time there is spent piping data to and from OpenMM).

Results: note that the plots below are on a log scale and spread over more than six orders of magnitude, so even a small visual difference can actually mean a factor of 10. For systems of fewer than 1000 particles, the Cython code is always faster. Beyond 1000 particles, however, the results depend on the contact radius. The pdist implementation is always slower than the Cython one and needs much more memory, because it explicitly creates a distance array, and it is slow because of the sqrt.

  • At small contact radius (about 1 contact per particle), ckdtree is a good choice for all system sizes.
  • At medium contact radius (5-50 contacts per particle), the naive Cython implementation is best up to about 10000 particles; beyond that OpenMM starts to win by a couple of orders of magnitude, but ckdtree is only 3-10 times worse.
  • At high contact radius (about 200 contacts per particle), the naive methods work fine up to 100k or 1M particles; beyond that OpenMM may win.

Installing OpenMM is quite tricky; you can read more in the file "contactmaps.py" at http://bitbucket.org/mirnylab/openmm-polymer, or in its README. However, the results below show that it is only advantageous for 5-50 contacts per particle and N > 100k particles.

[Figure: log-log benchmark of time vs. system size, one panel per contact radius, comparing the four methods]

The Cython code:

import numpy as np
cimport numpy as np
cimport cython

cdef extern from "<vector>" namespace "std":
    cdef cppclass vector[T]:
        cppclass iterator:
            T operator*()
            iterator operator++()
            bint operator==(iterator)
            bint operator!=(iterator)
        vector()
        void push_back(T&)
        T& operator[](int)
        T& at(int)
        iterator begin()
        iterator end()

np.import_array() # initialize C API to call PyArray_SimpleNewFromData
cdef public api tonumpyarray(int* data, long long size) with gil:
    if not (data and size >= 0): raise ValueError
    cdef np.npy_intp dims = size
    #NOTE: it doesn't take ownership of `data`. You must free `data` yourself
    return np.PyArray_SimpleNewFromData(1, &dims, np.NPY_INT, <void*>data)
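# note (added): tonumpyarray wraps an existing C int buffer as a NumPy array
# without copying; it is not used by contactsCython below, which copies the
# std::vector contents into a fresh array instead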

@cython.boundscheck(False)
@cython.wraparound(False)
def contactsCython(inArray, cutoff):
    inArray = np.asarray(inArray, dtype = np.float64, order = "C")
    cdef int N = len(inArray)
    cdef np.ndarray[np.double_t, ndim = 2] data = inArray
    cdef int j,i
    cdef double curdist
    cdef double cutoff2 = cutoff * cutoff  # IMPORTANT to avoid slow sqrt calculation
    cdef vector[int] contacts1
    cdef vector[int] contacts2
    for i in range(N):
        for j in range(i+1, N):
            curdist = (data[i,0] - data[j,0]) **2 +(data[i,1] - data[j,1]) **2 + (data[i,2] - data[j,2]) **2
            if curdist < cutoff2:
                contacts1.push_back(i)
                contacts2.push_back(j)
    cdef int M = len(contacts1)

    cdef np.ndarray[np.int32_t, ndim = 2] contacts = np.zeros((M,2), dtype = np.int32)
    for i in range(M):
        contacts[i,0] = contacts1[i]
        contacts[i,1] = contacts2[i]
    return contacts

Compiling the Cython code (or a makefile):

    cython --cplus fastContacts.pyx
    g++  -g -march=native -Ofast -fpic -c   fastContacts.cpp -o fastContacts.o `python-config --includes`
    g++  -g -march=native -Ofast -shared  -o fastContacts.so  fastContacts.o `python-config --libs`
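Alternatively (a sketch of my own, not part of the original answer), a minimal setup.py can drive the same build; the module and source file names fastContacts are assumptions carried over from above:

# setup.py -- hypothetical build script for fastContacts.pyx
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np

extensions = [Extension(
    "fastContacts",
    ["fastContacts.pyx"],
    language="c++",                   # the code uses std::vector
    include_dirs=[np.get_include()],  # for the NumPy C API
    extra_compile_args=["-O3", "-march=native"],
)]

setup(ext_modules=cythonize(extensions))

Build with python setup.py build_ext --inplace, after which from fastContacts import contactsCython works as in the test code below.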

The test code:

from __future__ import print_function, division

import signal
import time
from contextlib import contextmanager

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import ckdtree
from scipy.spatial.distance import pdist

from contactmaps import giveContactsOpenMM  # remove this unless you have OpenMM and openmm-polymer libraries installed
from fastContacts import contactsCython


class TimeoutException(Exception): pass


@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)


matplotlib.rcParams.update({'font.size': 8})


def close_pairs_ckdtree(X, max_d):
    tree = ckdtree.cKDTree(X)
    pairs = tree.query_pairs(max_d)
    return np.array(list(pairs))


def condensed_to_pair_indices(n, k):
    x = n - (4. * n ** 2 - 4 * n - 8 * k + 1) ** .5 / 2 - .5
    i = x.astype(int)
    j = k + i * (i + 3 - 2 * n) / 2 + 1
    return np.array([i, j]).T


def close_pairs_pdist(X, max_d):
    d = pdist(X)
    k = (d < max_d).nonzero()[0]
    return condensed_to_pair_indices(X.shape[0], k)


a = np.random.random((100, 3)) * 3  # test set
methods = {"cython": contactsCython, "ckdtree": close_pairs_ckdtree, "OpenMM": giveContactsOpenMM,
           "pdist": close_pairs_pdist}

# checking that each method gives the same value
allUniqueInds = []
for ind, method in methods.items():
    contacts = method(a, 1)
    uniqueInds = contacts[:, 0] + 100 * contacts[:, 1]  # unique index of each contacts
    allUniqueInds.append(np.sort(uniqueInds))  # adding sorted unique contacts
for j in allUniqueInds:
    assert np.allclose(j, allUniqueInds[0])

# now actually doing testing
repeats = [30,30,30, 30, 30, 20,  20,   10,   5,   3,     2 ,       1,     1,      1]
sizes =    [10,30,100, 200, 300,  500, 1000, 2000, 3000, 10000, 30000, 100000, 300000, 1000000]
systems = [[np.random.random((n, 3)) * ((n / 0.2) ** 0.333333) for k in range(repeat)] for n, repeat in
           zip(sizes, repeats)]

for j, radius in enumerate([0.5, 1, 2, 4, 7, 10]):
    plt.subplot(2, 3, j + 1)
    plt.title("Radius = {0}; {1:.2f} cont per particle".format(radius, 0.2 * (4 / 3 * np.pi * radius ** 3)))

    times = {i: [] for i in methods}

    for name, method in methods.items():
        for n, system, repeat in zip(sizes, systems, repeats):
            if name == "pdist" and n > 30000:
                break  # memory issues
            st = time.time()
            try:
                with time_limit(5 * repeat):
                    for ind in range(repeat):
                        k = len(method(system[ind], radius))
            except:
                print("Run aborted")
                break
            end = time.time()
            mytime = (end - st) / repeat
            times[name].append((n, mytime))
            print("{0} radius={1} n={2} time={3} repeat={4} contPerParticle={5}".format(name, radius, n, mytime,repeat, 2 * k / n))

    for name in sorted(times.keys()):
        plt.plot(*zip(*times[name]), label=name)
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel("System size")
    plt.ylabel("Time (seconds)")
    plt.legend(loc=0)

plt.show()

Answer 1 (score: 2):

Here is how to do it using the cKDTree module. See query_pairs.

import numpy as np
from scipy.spatial.distance import cdist
from scipy.spatial import ckdtree


def close_pairs(X,max_d):
    d = cdist(X,X)

    I,J = (d<max_d).nonzero()
    IJ  = np.sort(np.vstack((I,J)), axis=0)

    # remove diagonal element
    IJ  = IJ[:,np.diff(IJ,axis=0).ravel()!=0]

    # remove duplicate
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)

    return pairs

def close_pairs_ckdtree(X, max_d):
    tree = ckdtree.cKDTree(X)
    pairs = tree.query_pairs(max_d)
    return np.array(list(pairs))

def test():
    np.random.seed(0)
    X = np.random.rand(100,2)*20
    p = close_pairs(X,2)
    q = close_pairs_ckdtree(X, 2)

    from matplotlib import pyplot as plt
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p,0].T,X[p,1].T,'-b')
    plt.figure()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[q,0].T,X[q,1].T,'-b')

    plt.show()

test()
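A small consistency check (my addition, not in the original answer): query_pairs returns a set of (i, j) tuples with i < j, while close_pairs returns sorted index rows, so the two results can be compared after normalizing each pair to (min, max):

X = np.random.rand(100, 2) * 20
canon = lambda pairs: set(map(tuple, np.sort(np.asarray(pairs), axis=1)))
assert canon(close_pairs(X, 2)) == canon(close_pairs_ckdtree(X, 2))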

Answer 2 (score: 2):

I finally found it myself. The function converting an index k of the condensed distance array into the equivalent i,j of the square distance array is:

def condensed_to_pair_indices(n,k):
    x = n-(4.*n**2-4*n-8*k+1)**.5/2-.5
    i = x.astype(int)
    j = k+i*(i+3-2*n)/2+1
    return i,j
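For reference (my reconstruction, not part of the original answer): the forward mapping being inverted here is k = comb(n,2) - comb(n-i,2) + j - i - 1 for 0 <= i < j < n, i.e. the same formula answer 3 takes from the squareform documentation; solving the resulting quadratic in i is what produces the square root above. A quick numerical check against the upper-triangle order used by pdist, assuming the function above is in scope:

import numpy as np

n = 137                               # arbitrary size for the check
k = np.arange(n * (n - 1) // 2)       # every condensed index
i, j = condensed_to_pair_indices(n, k)
ii, jj = np.triu_indices(n, k=1)      # pdist stores the pairs in this order
assert np.array_equal(i, ii) and np.allclose(j, jj)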

I had to play a bit with sympy to find it. Now, computing all pairs of points closer than a given distance is simply:

from scipy.spatial.distance import pdist

def close_pairs_pdist(X,max_d):
    d = pdist(X)
    k = (d<max_d).nonzero()[0]
    return condensed_to_pair_indices(X.shape[0],k)

As expected, it is more efficient than the other methods (but I did not test ckdtree). I will update the timing answer.

Answer 3 (score: 1):

Slightly faster. I have not tested the time difference thoroughly, but running it a few times gives a time of about 0.0755529403687 for your method and 0.0328771495819 for mine. I use the triu trick to get rid of the upper triangle of the array (which is where the duplicates are), including the diagonal (which is where the self-distances are), and I also do not sort, since when you plot it, it does not matter whether the pairs are plotted in order. So I guess the speedup is around 15% or so.

import numpy as np
from scipy.spatial.distance import cdist, pdist
from scipy.misc import comb
import time

def close_pairs(X,max_d):
    d = cdist(X,X)
    I,J = (d<max_d).nonzero()
    IJ  = np.sort(np.vstack((I,J)), axis=0)

    # remove diagonal element
    IJ  = IJ[:,np.diff(IJ,axis=0).ravel()!=0]

    # remove duplicate
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)

    return pairs


def close_pairs1(X,max_d):
    d = cdist(X,X)
    d1 = np.triu_indices(len(X)) # indices of the upper triangle including the diagonal
    d[d1] = max_d+1 # value that will not get selected when doing d<max_d in the next line
    I,J = (d<max_d).nonzero()
    pairs = np.vstack((I,J)).T
    return pairs

def close_pairs3(X, max_d):
    d = pdist(X)
    n = len(X)
    pairs = np.zeros((0,2))
    for i in range(n):
        for j in range(i+1,n):
            # formula from http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html
            a=d[int(comb(n,2)-comb(n-i,2)+j-i-1+0.1)] # the +0.1 is because otherwise i get floating point trouble
            if(a<max_d):
                pairs = np.r_[pairs, np.array([i,j])[None,:]]
    return pairs

def close_pairs4(X, max_d):
    d = pdist(X)
    n = len(X)
    a = np.where(d<max_d)[0]
    i = np.arange(n)[:,None]
    j = np.arange(n)[None,:]
    b = np.array(comb(n,2)-comb(n-i,2)+j-i-1+0.1, dtype=int)
    d1 = np.tril_indices(n)
    b[d1] = -1

    pairs = np.zeros((0,2), dtype=int)

    # next part is the bottleneck: the np.where each time, 
    for v in a:
        i, j = np.where(v==b) 
        pairs = np.r_[pairs, np.array([i[0],j[0]])[None,:]]
    return pairs

def close_pairs5(X, max_d):
    t0=time.time()
    d = pdist(X)
    n = len(X)
    a = np.where(d<max_d)[0]
    i = np.arange(n)[:,None]
    j = np.arange(n)[None,:]
    t1 = time.time()
    b = np.array(comb(n,2)-comb(n-i,2)+j-i-1+0.1, dtype=int)
    d1 = np.tril_indices(n)
    b[d1] = -1
    t2 = time.time()
    V = b[:,:,None]-a[None,None,:] # takes a little time
    t3 = time.time()
    p = np.where(V==0) # takes most of the time, thought that removing the for-loop from the previous method might improve it, but it does not do that much. This method contains the formula you wanted though, but apparently it is still faster if you use the cdist methods
    t4 = time.time()
    pairs = np.vstack((p[0],p[1])).T
    print t4-t3,t3-t2, t2-t1, t1-t0
    return pairs





def test():
    X = np.random.rand(1000,2)*20
    import time
    t0 = time.time()
    p = close_pairs(X,2)
    t1 = time.time()
    p2 = close_pairs1(X,2)
    t2 = time.time()
    print t2-t1, t1-t0

    from matplotlib import pyplot as plt
    plt.figure()
    plt.clf()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p,0].T,X[p,1].T,'-b')
    plt.figure()
    plt.clf()
    plt.plot(X[:,0],X[:,1],'.r')
    plt.plot(X[p2,0].T,X[p2,1].T,'-b')
    plt.show()



test()

Note: plotting lags if you do it with 1K points, but you need 1K points to compare the speed; I did check that it still plots correctly when done with 100 points. The speed difference is about 10% to 20%, and I do not think it will get much better than that, since I got rid of all the sorting and unique-element stuff, so the part that takes most of the time is probably the d = cdist(X, X) line.

EDIT: more testing shows that, of those timings, the cdist line takes about 0.065 seconds, while the rest of your method takes about 0.02 and mine about 0.015 seconds. Conclusion: the main bottleneck of the code is the d = cdist(X, X) line; what I changed speeds up the rest of the code, but the main bottleneck is still there.
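One way to shave a bit off that bottleneck (my suggestion, untested here; it reuses the squared-cutoff trick from the Cython answer above) is to ask cdist for squared Euclidean distances and compare against max_d**2, which skips the square root:

def close_pairs_triu_sq(X, max_d):
    # same as close_pairs1, but compares squared distances (no sqrt inside cdist)
    d2 = cdist(X, X, 'sqeuclidean')
    d2[np.triu_indices(len(X))] = max_d**2 + 1   # mask the diagonal and upper triangle
    I, J = (d2 < max_d**2).nonzero()
    return np.vstack((I, J)).T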

EDIT: added method close_pairs3, which gives you the formula but is slow (still need to figure out how to invert that formula, then it will be super fast; will do that tomorrow, and it will use np.where(pdist(X) ...)

EDIT: added method close_pairs4, slightly better than 3, which explains what is going on but is slow, and close_pairs5, which is the same thing without that for-loop but is still very slow.

Answer 4 (score: 0):

I put together some code to compare the proposed solutions.

Note: I am using scipy 0.11 and cannot use the ckdtree solution (only kdtree), which I expect to be much slower. Could anyone running scipy v0.12+ try this code?

import numpy as np
from scipy.spatial.distance import cdist, pdist
from scipy.spatial import ckdtree
from scipy.spatial import kdtree


def close_pairs(X,max_d):
    d = cdist(X,X)

    I,J = (d<max_d).nonzero()
    IJ  = np.sort(np.vstack((I,J)), axis=0)

    # remove diagonal element
    IJ  = IJ[:,np.diff(IJ,axis=0).ravel()!=0]

    # remove duplicate
    dt = np.dtype([('i',int),('j',int)])
    pairs = np.unique(IJ.T.view(dtype=dt)).view(int).reshape(-1,2)

    return pairs


def condensed_to_pair_indices(n,k):
    x = n-(4.*n**2-4*n-8*k+1)**.5/2-.5
    i = x.astype(int)
    j = k+i*(i+3-2*n)/2+1
    return i,j

def close_pairs_pdist(X,max_d):
    d = pdist(X)
    k = (d<max_d).nonzero()[0]
    return condensed_to_pair_indices(X.shape[0],k)


def close_pairs_triu(X,max_d):
    d = cdist(X,X)
    d1 = np.triu_indices(len(X)) # indices of the upper triangle including the diagonal
    d[d1] = max_d+1 # value that will not get selected when doing d<max_d in the next line
    I,J = (d<max_d).nonzero()
    pairs = np.vstack((I,J)).T
    return pairs

def close_pairs_ckdtree(X, max_d):
    tree = ckdtree.cKDTree(X)
    pairs = tree.query_pairs(max_d)
    return pairs       # remove the conversion as it is not required

def close_pairs_kdtree(X, max_d):
    tree  = kdtree.KDTree(X)
    pairs = tree.query_pairs(max_d)
    return pairs       # remove the conversion as it is not required


methods = [close_pairs, close_pairs_pdist, close_pairs_triu, close_pairs_kdtree] #, close_pairs_ckdtree]

def time_test(n=[10,50,100], max_d=[5,10,50], iter_num=100):
    import timeit

    for method in methods:
        print '-- time using ' + method.__name__ + ' ---'
        for ni in n:
            for d in max_d:
                setup = '\n'.join(['import numpy as np','import %s' % __name__,'np.random.seed(0)','X = np.random.rand(%d,2)*100'%ni])
                stmt  = 'close_pairs.%s(X,%f)' % (method.__name__,d)  # assumes this file is saved as close_pairs.py
                time  = timeit.timeit(stmt=stmt, setup=setup, number=iter_num)/iter_num
                print 'n=%3d, max_d=%2d: \t%.2fms' % (ni, d,time*1000)

The output of time_test(iter_num=10, n=[20,100,500], max_d=[1,5,10]) is:

-- time using close_pairs ---
n= 20, max_d= 1:    0.22ms
n= 20, max_d= 5:    0.16ms
n= 20, max_d=10:    0.21ms
n=100, max_d= 1:    0.41ms
n=100, max_d= 5:    0.53ms
n=100, max_d=10:    0.97ms
n=500, max_d= 1:    7.12ms
n=500, max_d= 5:    12.28ms
n=500, max_d=10:    33.41ms
-- time using close_pairs_pdist ---
n= 20, max_d= 1:    0.11ms
n= 20, max_d= 5:    0.10ms
n= 20, max_d=10:    0.11ms
n=100, max_d= 1:    0.19ms
n=100, max_d= 5:    0.19ms
n=100, max_d=10:    0.19ms
n=500, max_d= 1:    2.31ms
n=500, max_d= 5:    2.82ms
n=500, max_d=10:    2.49ms
-- time using close_pairs_triu ---
n= 20, max_d= 1:    0.17ms
n= 20, max_d= 5:    0.16ms
n= 20, max_d=10:    0.16ms
n=100, max_d= 1:    0.83ms
n=100, max_d= 5:    0.80ms
n=100, max_d=10:    0.80ms
n=500, max_d= 1:    23.64ms
n=500, max_d= 5:    22.87ms
n=500, max_d=10:    22.96ms
-- time using close_pairs_kdtree ---
n= 20, max_d= 1:    1.71ms
n= 20, max_d= 5:    1.69ms
n= 20, max_d=10:    1.96ms
n=100, max_d= 1:    34.99ms
n=100, max_d= 5:    35.47ms
n=100, max_d=10:    34.91ms
n=500, max_d= 1:    253.87ms
n=500, max_d= 5:    255.05ms
n=500, max_d=10:    256.66ms

Conclusions:

Judging from these numbers, the pdist-based method is clearly the fastest here, and the pure-Python kdtree is by far the slowest. However, the ckdtree method still needs to be tested.