假设我有一个坐标列表:
data = [
[(10, 20), (100, 120), (0, 5), (50, 60)],
[(13, 20), (300, 400), (100, 120), (51, 62)]
]
我想找出所有出现在 data 的每个列表中的元组,以及那些与其他列表(即除它自己所在列表之外的列表)中的元组相差3或更少的元组。我怎样才能在Python中高效地完成这项工作?
对于上面的例子,结果应该是:
[[(100, 120), # since it occurs in both lists
(10, 20), (13, 20), # since they differ by only 3
(50, 60), (51, 62)]]
结果不包括(0,5)和(300,400),因为它们既没有同时出现在两个列表中,也没有与另一个列表中的任何元素相差3或更少。
如何计算?谢谢。
答案 0 :(得分:1)
朴素的实现会很慢:复杂度为O(n ^ 2),需要把每个节点与其他每个节点逐一比较。可以使用树来加速。
此实现使用简单的四叉树来提高搜索效率。它没有尝试对树做任何平衡,所以一个非常有序的点列表可能会使它非常低效。对于很多用途,简单地打乱(shuffle)列表可能就足够好了;只是一定不要传入大量按坐标排序的项目,因为这会使树退化成一个链表。
这里的优化很简单:如果我们要在某个点的3个单位的欧几里德距离内寻找项目,并且我们知道某个子树中的所有项目都在该点右边至少3个单位之外,那么该子树中就不可能有任何点与它的距离小于3个单位,因此可以跳过整个子树。
此代码属于公共领域。尽量不要把它作为家庭作业。
#!/usr/bin/python
import math
def euclidean_distance(pos1, pos2):
    """
    Return the straight-line (Euclidean) distance between two positions.

    pos1 and pos2 are indexable as (x, y); only the first two items are read.
    """
    dx = pos1[0] - pos2[0]
    dy = pos1[1] - pos2[1]
    return math.sqrt(math.pow(dx, 2) + math.pow(dy, 2))
class QuadTreeNode(object):
    """
    A node of a simple, unbalanced point quadtree.

    Every node stores one (x, y) position and up to four children, one per
    quadrant relative to that position.  No balancing is attempted, so
    inserting points that are already sorted by coordinate produces a tree
    that is effectively a linked list.  Insertion and search are written
    iteratively so that such a deep tree cannot exhaust the interpreter's
    recursion limit.
    """

    def __init__(self, pos):
        """
        Create a QuadTreeNode at the specified position. pos must be an (x, y) tuple.
        Children are classified by quadrant.
        """
        # Children of this node are ordered TL, TR, BL, BR (origin top-left).
        self.children = [None, None, None, None]
        self.pos = pos

    def classify_node(self, pos):
        """
        Return which entry in children can contain pos. If pos is equal to this
        node, return None.

        >>> node = QuadTreeNode((10, 20))
        >>> node.classify_node((10, 20)) is None
        True
        >>> node.classify_node((2, 2))
        0
        >>> node.classify_node((50, 2))
        1
        >>> node.classify_node((2, 50))
        2
        >>> node.classify_node((50, 50))
        3

        X boundary condition:

        >>> node.classify_node((10, 2))
        0
        >>> node.classify_node((10, 50))
        2

        Y boundary condition:

        >>> node.classify_node((2, 20))
        0
        >>> node.classify_node((50, 20))
        1
        """
        if pos == self.pos:
            return None

        if pos[0] <= self.pos[0]:  # Left
            if pos[1] <= self.pos[1]:  # Top-left
                return 0
            return 2  # Bottom-left

        # Right
        if pos[1] <= self.pos[1]:  # Top-right
            return 1
        return 3  # Bottom-right

    def add_node(self, node):
        """
        Add a specified point under this node.

        Walks down from this node, following the quadrant that can contain
        node.pos, and attaches node at the first empty slot.  A node whose
        position equals one already on that path is a duplicate and is
        silently ignored.
        """
        current = self
        while True:
            quadrant = current.classify_node(node.pos)
            if quadrant is None:
                # node is equal to current, so this is a duplicate node. Ignore it.
                return

            child = current.children[quadrant]
            if child is None:
                current.children[quadrant] = node
                return

            # We already have a node there; descend and try again in the child.
            current = child

    @staticmethod
    def CreateQuadTree(data):
        """
        Create a quad tree from the specified list of points.

        The first point becomes the root and the remaining points are inserted
        in order.  data must contain at least one point; an empty sequence
        raises IndexError.
        """
        root = QuadTreeNode(data[0])
        for val in data[1:]:
            root.add_node(QuadTreeNode(val))
        return root

    def distance_from_pos(self, pos):
        """Return the Euclidean distance between this node's position and pos."""
        return euclidean_distance(self.pos, pos)

    def __str__(self):
        return str(self.pos)

    def find_point_within_range(self, pos, distance):
        """
        If a point exists within the specified Euclidean distance of the specified
        point, return it. Otherwise, return None.

        The value returned is the QuadTreeNode holding the point.  Nodes are
        examined in depth-first pre-order, visiting children in index order,
        and the first match wins.
        """
        pending = [self]
        while pending:
            node = pending.pop()
            if node.distance_from_pos(pos) <= distance:
                return node

            # A child quadrant only holds points on one side of node.pos on
            # each axis.  If node.pos itself is already more than `distance`
            # past pos in that direction, nothing in the quadrant can be in
            # range, so the whole subtree is skipped.
            skip_left = node.pos[0] < pos[0] - distance
            skip_right = node.pos[0] > pos[0] + distance
            skip_up = node.pos[1] < pos[1] - distance
            skip_down = node.pos[1] > pos[1] + distance

            # Push in reverse so that children are popped in index order 0..3.
            for axis in (3, 2, 1, 0):
                child = node.children[axis]
                if child is None:
                    # We don't have a node on this axis.
                    continue
                if axis in (0, 2) and skip_left:  # axis moves left on X
                    continue
                if axis in (1, 3) and skip_right:  # axis moves right on X
                    continue
                if axis in (0, 1) and skip_up:  # axis moves up on Y
                    continue
                if axis in (2, 3) and skip_down:  # axis moves down on Y
                    continue
                pending.append(child)

        return None

    @staticmethod
    def find_point_in_range_for_all_trees(point, trees, distance):
        """
        If all QuadTreeNodes in trees contain a point within the specified distance
        of point, return True. Otherwise, return False.
        """
        for tree in trees:
            if tree.find_point_within_range(point, distance) is None:
                return False
        return True
def test_naive(data, distance):
    """
    Brute-force reference implementation.

    data is a list of point lists.  Return the set of points for which every
    list in data (including the point's own list) holds at least one point
    within the given Euclidean distance.
    """
    def near_some_point(candidates, point):
        # True as soon as one candidate is close enough.
        return any(euclidean_distance(other, point) <= distance
                   for other in candidates)

    def near_every_list(point):
        return all(near_some_point(candidates, point) for candidates in data)

    return set(point
               for points in data
               for point in points
               if near_every_list(point))
def test_tree(data, distance):
    """
    Quadtree-backed equivalent of the brute-force search.

    Builds one QuadTreeNode tree per list in data, then returns the set of
    points for which every tree holds a point within the given Euclidean
    distance.  The caller's distance is used for the search; earlier code
    ignored it and always searched with a radius of 3.
    """
    trees = [QuadTreeNode.CreateQuadTree(d) for d in data]
    results = set()
    for d in data:
        for point in d:
            if QuadTreeNode.find_point_in_range_for_all_trees(point, trees, distance):
                results.add(point)
    return results
def test():
    """
    Check the tree search against the naive search.

    First on the small sample from the question, then on ten lists of 500
    random points each.  Raises AssertionError if the two implementations
    disagree.  print is called with a single parenthesised argument so the
    output is the same under Python 2 and Python 3.
    """
    import random

    sample_data = [
        [(10, 20), (100, 120), (0, 5), (50, 60)],
        [(13, 20), (300, 400), (100, 120), (51, 62)],
    ]
    result1 = test_naive(sample_data, 3)
    result2 = test_tree(sample_data, 3)
    print(result1)
    assert result1 == result2

    # Loosely validate the tree algorithm against a lot of sample data, and compare
    # performance while we're at it:
    def random_data():
        return [(random.randint(0, 1000), random.randint(0, 1000))
                for _ in range(0, 500)]

    data = [random_data() for _ in range(0, 10)]
    print("Searching (naive)...")
    result1 = test_naive(data, 3)
    print("Searching (tree)...")
    result2 = test_tree(data, 3)
    assert result1 == result2
if __name__ == "__main__":
    # Script entry point: run the comparison test, then the doctests
    # embedded in this module's docstrings.
    import doctest

    test()
    doctest.testmod()
答案 1 :(得分:0)
我希望这会让你开始。任何改进都将不胜感激。
出现在所有列表中都很简单 - 只需获取列表中所有元素的交集。
>>> data = [
... [(10, 20), (100, 120), (0, 5), (50, 60)],
... [(13, 20), (300, 400), (100, 120), (51, 62)]
... ]
>>> dataset = [set(d) for d in data]
>>> dataset[0].intersection(*dataset[1:])
set([(100, 120)])
至于“与不在同一列表中的元组相差3或更少”这一条件,在我看来是一个图/二维空间问题。对此没有比多项式时间更省事的简单算法;如果你的数据集不是很大,可以直接遍历它们,把不在同一列表中的邻近的点分到一组。
答案 2 :(得分:0)
<= 3
),将“2D平面”虚拟切分为合适的“单元格”,这样每个点只需要与相邻“单元格”中的点进行比较。如果您的数据集很大,这可能真的有帮助(相比之下,把每个点与基本上所有其他点都进行比较则是一种蛮力解法)。
下面是这个想法的一个Python实现(因为barry的代码草图似乎是Perl或其他语言),它以清晰而不是速度为目标......:
import collections
import math
def cellof(point):
    """
    Return the (column, row) index of the 3x3 grid cell containing point.

    point must unpack into exactly two coordinates; each one is
    floor-divided by 3.
    """
    col, row = (coord // 3 for coord in point)
    return col, row
def distance(p1, p2):
    """Return the Euclidean distance between the (x, y) points p1 and p2."""
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return math.hypot(dx, dy)
def process(data):
    """
    Return the sorted list of points that lie within distance 3 of a point
    belonging to a different list.

    data is a list of point lists.  Points are bucketed into grid cells via
    cellof(); each point is then compared only against points in its own cell
    and the eight surrounding cells.  A point qualifies as soon as one point
    from any other list is within range.
    """
    cells = collections.defaultdict(list)
    for i, points in enumerate(data):
        for p in points:
            cells[cellof(p)].append((i, p))

    res = set()
    for (cx, cy), members in cells.items():
        # Look neighbours up with .get(): indexing the defaultdict would
        # insert an empty list for every missing neighbour cell, which
        # changes the dict while it is being iterated (a RuntimeError on
        # Python 3) and needlessly grows it.
        nearby = [
            entry
            for nx in range(cx - 1, cx + 2)
            for ny in range(cy - 1, cy + 2)
            for entry in cells.get((nx, ny), ())
        ]
        for i, p in members:
            # Entries from the same source list (same index i) do not count.
            if any(i != otheri and distance(p, otherp) <= 3
                   for otheri, otherp in nearby):
                res.add(p)
    return sorted(res)
if __name__ == '__main__':  # just an example
    data = [
        [(10, 20), (100, 120), (0, 5), (50, 60)],
        [(13, 20), (300, 400), (100, 120), (51, 62)],
    ]
    # Single-argument call form: prints the same under Python 2 and Python 3.
    print(process(data))
作为脚本运行时,会生成输出
[(10, 20), (13, 20), (50, 60), (51, 62), (100, 120)]
当然,要确定这是否值得,或者更简单的蛮力方法是否确实更好,唯一可行的方法是在真实数据上对两种解决方案运行基准测试 - 也就是您的程序在现实生活中实际需要处理的各种数据集。根据您拥有的列表数量、每个列表中点的数量以及点之间的间距,性能差异可能很大,而且测量比猜测更好! - )