ind pos
x y z
-1.0 7.0 0.0 21 [-2.76788330078, 217.786453247, 26.6822681427]
0.0 22 [-7.23852539062, 217.274139404, 26.6758270264]
0.0 1.0 152 [-0.868591308594, 2.48404550552, 48.4036369324]
6.0 2.0 427 [-0.304443359375, 182.772140503, 79.4475860596]
def dist(a, b):
    """Return the Euclidean distance between two points.

    Args:
        a: Coordinates of the first point (any sequence or array of numbers).
        b: Coordinates of the second point, same length as ``a``.

    Returns:
        The straight-line distance between ``a`` and ``b`` as a float.
    """
    # Convert to float first so integer inputs (e.g. uint8 arrays) cannot
    # wrap around during the subtraction or the dot product.
    diff = N.asarray(a, dtype=float) - N.asarray(b, dtype=float)
    d = N.sqrt(N.dot(diff, diff))
    return d
def getPairs(a, b):
    """Build the candidate pairs between the members of two groups.

    Args:
        a: Sequence of identifiers from the first group.
        b: Sequence of identifiers from the second group.

    Returns:
        A list of ``(a_item, b_item)`` tuples. When ``a`` and ``b`` hold the
        same elements, each unordered pair is listed once and no element is
        paired with itself; otherwise every combination is returned.
    """
    if N.array_equal(a, b):
        # Same group on both sides: only look "forward" (j > i) so that
        # (x, y) and (y, x) are not both produced and (x, x) never appears.
        pairs = [(a[i], b[j]) for i in range(len(a)) for j in range(i + 1, len(b))]
    else:
        pairs = [(a[i], b[j]) for i in range(len(a)) for j in range(len(b))]
    return pairs
# Collect candidate index pairs between the current box (`group`) and the box
# found at each offset (i, j, k) with i, j, k in {0, 1}. An offset is skipped
# when any of its shifted coordinates equals 34.
# NOTE(review): `name`, `group` and `boxes` are defined outside this excerpt --
# presumably the key/group of a loop over a pandas groupby named `boxes`; confirm.
pairs = [getPairs(list(group.ind), list(boxes.get_group((name[0]+i, name[1]+j, name[2]+k)).ind)) \
for i in [0,1] for j in [0,1] for k in [0,1] if name[0]+i != 34 and name[1]+j != 34 and name[2]+k != 34]
# Flatten the per-offset lists into one list of pairs.
pairs = list(itertools.chain(*pairs))
# Store the pairs in a new frame and compute one distance `r` per pair.
subInfo = pandas.DataFrame()
subInfo['pairs'] = pairs
# Each pair triggers two `query` calls on df_yz; the matching `pos` values are
# then indexed with key 0.
# NOTE(review): whether `.pos[0]` selects the intended row depends on the index
# of df_yz, which is not visible here -- confirm.
subInfo['r'] = subInfo.pairs.apply(lambda x: dist(df_yz.query('ind == @x[0]').pos[0], df_yz.query('ind == @x[1]').pos[0]))
pair distance
(21, 22) 22.59
(21, 152) 15.01
(22, 427) 19.22
答案 0 :(得分:1)
花了一些时间,下面是几种可能的解决方法,希望它们不言自明。代码是在 Jupyter Notebook 中用 Python 3.x 编写的。备注:如果您的坐标是世界坐标(例如经纬度),可以考虑使用 Haversine 距离(大圆距离),而不是直线的欧几里得距离。
import pandas as pd
import numpy as np
# Sample records: each dict carries an identifier `ind`, a 3-D position `pos`
# and whichever of the x/y/z keys were given for that record.
values = [
    {'x': -1.0, 'y': 7.0, 'z': 0.0, 'ind': 21, 'pos': [-2.76788330078, 217.786453247, 26.6822681427]},
    {'z': 0.0, 'ind': 22, 'pos': [-7.23852539062, 217.274139404, 26.6758270264]},
    {'y': 0.0, 'z': 1.0, 'ind': 152, 'pos': [-0.868591308594, 2.48404550552, 48.4036369324]},
    {'y': 6.0, 'z': 2.0, 'ind': 427, 'pos': [-0.304443359375, 182.772140503, 79.4475860596]},
]
def dist(a, b):
    """Calculate the Euclidean distance between two 3D-vectors.

    Args:
        a: Coordinates of the first point (sequence or array of numbers).
        b: Coordinates of the second point, same length as ``a``.

    Returns:
        The straight-line distance between ``a`` and ``b`` as a float.
    """
    # Convert to float first so integer inputs (e.g. uint8 arrays) cannot
    # wrap around during the subtraction or the dot product.
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    d = np.sqrt(np.dot(diff, diff))
    return d
# Build the source DataFrame from the list of record dicts in `values`.
df_initial = pd.DataFrame(values)
pairs distance
1 (21, 22) 4.499905
3 (21, 427) 63.373886
7 (22, 427) 63.429709
# Version 1: pair every row with every other row through a self-merge.
df = df_initial.copy()
# A constant helper key makes the merge behave like a cross join, so each
# resulting row carries two positions (suffix 1 and suffix 2).
df = df.assign(tmp=1)
df = df.merge(df, on='tmp', suffixes=['1', '2']).drop(columns='tmp')
# Throw away rows that pair a record with itself.
df = df.loc[df['ind1'] != df['ind2']]
# Distance between the two positions of every remaining row.
df['distance'] = df.apply(lambda row: dist(row['pos1'], row['pos2']), axis=1)
# Keep only the pairs closer than the cut-off of 70.
df = df.loc[df['distance'] < 70]
# Represent each pair as a tuple of the two original identifiers.
df['pairs'] = list(zip(df['ind1'], df['ind2']))
# Reduce the frame to the two columns of interest.
df = df.loc[:, ['pairs', 'distance']]
def sort_tuple(idx):
    """Return the two members of ``idx`` ordered from low to high.

    Args:
        idx: An iterable holding exactly two comparable values.

    Returns:
        A 2-tuple with the smaller value first.

    Raises:
        ValueError: If ``idx`` does not contain exactly two items.
    """
    x, y = idx
    if y < x:
        return y, x
    return x, y
# Order the members of each pair from low to high so that (a, b) and (b, a)
# become the same tuple.
df['pairs'] = df['pairs'].apply(sort_tuple)
# Drop the mirrored duplicates, keeping the first occurrence of each pair.
df.drop_duplicates(subset=['pairs'], inplace=True)
# Print the result (outside a notebook a bare expression would show nothing).
print(df)
# Version 2: compare each row against all rows, one row at a time.
df = df_initial.copy()
results = list()
for index, row1 in df.iterrows():
    # calculate distance between current coordinate and all original rows in the data
    df['distance'] = df.apply(lambda row2: dist(row1['pos'], row2['pos']), axis=1)
    # filter only those within a specific distance and drop rows with same index as current coordinate
    df_tmp = df[(df['distance'] < 70) & (df['ind'] != row1['ind'])].copy()
    # prepare final data
    df_tmp['ind2'] = row1['ind']
    df_tmp['pairs'] = list(zip(df_tmp['ind'], df_tmp['ind2']))
    # remember data; without this append `results` stays empty and the
    # concat below has nothing to combine
    results.append(df_tmp)
# combine all into one dataframe
df = pd.concat(results)
# select columns of interest
df = df[['pairs', 'distance']]
def sort_tuple(idx):
    """Return the two members of ``idx`` ordered from low to high.

    Args:
        idx: An iterable holding exactly two comparable values.

    Returns:
        A 2-tuple with the smaller value first.

    Raises:
        ValueError: If ``idx`` does not contain exactly two items.
    """
    x, y = idx
    if y < x:
        return y, x
    return x, y
# Order the members of each pair from low to high so that (a, b) and (b, a)
# become the same tuple.
df['pairs'] = df['pairs'].apply(sort_tuple)
# Drop the mirrored duplicates, keeping the first occurrence of each pair.
df.drop_duplicates(subset=['pairs'], inplace=True)
# Print the result (outside a notebook a bare expression would show nothing).
print(df)
from scipy import spatial
# Version 3: build a KD-tree over the position vectors of all records.
tree = spatial.KDTree(df_initial['pos'].tolist())
# Distances of the tree against itself, limited to a maximum distance of 70;
# the result is a sparse matrix.
distances = tree.sparse_distance_matrix(other=tree, max_distance=70)
# Convert the sparse result to COOrdinate (coo) format.
coo = distances.tocoo()
def get_cell_value(idx: int, column: str = 'ind'):
    """Look up one cell of ``df_initial`` by row position and column label.

    Args:
        idx: Zero-based position of the row in ``df_initial``.
        column: Label of the column to read; defaults to ``'ind'``.

    Returns:
        The value stored in that row and column.
    """
    return df_initial.iloc[idx][column]
def extract_indices(row):
    """Translate the two row positions of a distance entry into identifiers.

    Args:
        row: A row carrying the labels ``'idx1'`` and ``'idx2'``, each a row
            position into the original data (possibly stored as a float).

    Returns:
        A 2-tuple with the ``get_cell_value`` result for each position.
    """
    # Read the fields by label. Unpacking the row positionally depends on the
    # column order of the frame, which follows dict insertion order
    # (idx1, idx2, distance) on current pandas and would mix up the fields.
    first = int(row['idx1'])
    second = int(row['idx2'])
    return get_cell_value(first), get_cell_value(second)
# One row per stored matrix entry: the two row positions and their distance.
sparse_columns = {'idx1': coo.row, 'idx2': coo.col, 'distance': coo.data}
df = pd.DataFrame(sparse_columns)
# Map the row positions of every entry to a tuple via extract_indices.
df = df.assign(pairs=df.apply(extract_indices, axis=1))
# Reduce the frame to the two columns of interest.
df = df.loc[:, ['pairs', 'distance']]
def sort_tuple(idx):
    """Return the two members of ``idx`` ordered from low to high.

    Args:
        idx: An iterable holding exactly two comparable values.

    Returns:
        A 2-tuple with the smaller value first.

    Raises:
        ValueError: If ``idx`` does not contain exactly two items.
    """
    x, y = idx
    if y < x:
        return y, x
    return x, y
# Order the members of each pair from low to high so that (a, b) and (b, a)
# become the same tuple.
df['pairs'] = df['pairs'].apply(sort_tuple)
# Drop the mirrored duplicates, keeping the first occurrence of each pair.
df.drop_duplicates(subset=['pairs'], inplace=True)
# Print the result (outside a notebook a bare expression would show nothing).
print(df)
性能如何?如果您只想知道原始数据中哪些行彼此在所需距离之内,那么 KDTree 版本(第三个版本)会非常快:生成稀疏矩阵只需 4 ms。但由于我随后用该矩阵中的索引从原始数据中提取数据,性能有所下降。当然,这应该在完整的数据集上进行测试。