嗨,我编写了一段代码,用于找出字典中哪些键的值列表之间有 5 个或更多相同的元素。
# Map every name to the list of values associated with it.
dictionary = {'Mary': [7, 0, 19, 19, 9, 18, 8, 11, 6, 1], 'John': [0, 6, 7, 9, 18, 2, 4, 5, 13, 17], 'Paul': [17, 12, 18, 16, 9, 5, 6, 7, 0, 3], 'Joe': [4, 15, 2, 8, 3, 0, 6, 7, 9, 18], 'Peter': [5, 3, 10, 2, 4, 16, 7, 6, 15, 13], 'Maggie': [13, 6, 5, 4, 8, 9, 7, 18, 11, 10], 'Ken': [2, 18, 16, 6, 0, 17, 4, 15, 11, 7], 'Roger': [3, 1, 16, 4, 13, 14, 19, 11, 8, 0]}

# Convert each value list to a set exactly once instead of on every comparison.
value_sets = {name: set(values) for name, values in dictionary.items()}
names = list(value_sets)

# Collect, for every shared-value key, the set of names that produced it.
# Each unordered pair is visited once: intersecting (A, B) and (B, A) gives
# the same result, so the second visit would be pure overhead.
members_by_key = {}
for position, key in enumerate(names):
    for searchKey in names[position + 1:]:
        intersectionList = sorted(value_sets[key] & value_sets[searchKey])
        if len(intersectionList) >= 5:
            members_by_key.setdefault(str(intersectionList), set()).update((key, searchKey))

# Keys are the string form of the sorted shared values; values are lists of
# unique names, sorted so that the printed result is the same on every run.
clusterDict = {shared: sorted(members) for shared, members in members_by_key.items()}
print(clusterDict)
如果我向字典中添加更多的键值对,处理速度会大大降低。我想知道是否有更快或更优化的方法来找出交集/公共元素。先谢谢了。
答案 0 :(得分:0)
您可以先把所有列表一次性转换为集合,并且避免冗余检查,从而节省大量时间(所谓冗余,是指对于列表 [A, B, C],您当前的代码实际上既检查了 A intersect B,又检查了 B intersect A)。
您可以利用 itertools.combinations 生成所有可能的两两组合。
from itertools import combinations
# Map every name to the list of values associated with it.
dictionary = {'Mary': [7, 0, 19, 19, 9, 18, 8, 11, 6, 1], 'John': [0, 6, 7, 9, 18, 2, 4, 5, 13, 17], 'Paul': [17, 12, 18, 16, 9, 5, 6, 7, 0, 3], 'Joe': [4, 15, 2, 8, 3, 0, 6, 7, 9, 18], 'Peter': [5, 3, 10, 2, 4, 16, 7, 6, 15, 13], 'Maggie': [13, 6, 5, 4, 8, 9, 7, 18, 11, 10], 'Ken': [2, 18, 16, 6, 0, 17, 4, 15, 11, 7], 'Roger': [3, 1, 16, 4, 13, 14, 19, 11, 8, 0]}

# Build the sets once up front so no list is converted more than once.
dict_of_sets = {name: set(values) for name, values in dictionary.items()}

# Maps a sorted tuple of shared values to the list of names that share them.
clusterDict = {}
for (key1, value1), (key2, value2) in combinations(dict_of_sets.items(), 2):
    intersect = value1 & value2
    if len(intersect) >= 5:
        # The key is a tuple; wrap it in str() if a string key is preferred.
        members = clusterDict.setdefault(tuple(sorted(intersect)), [])
        # Several pairs can yield the same intersection, so only add names
        # that are not listed yet; this keeps each name unique per cluster.
        for name in (key1, key2):
            if name not in members:
                members.append(name)
请注意,您还可以使用元组作为字典键,这在我看来至少比将列表类型转换为字符串更干净。但是,请随时根据需要更改该部分。
这应该会更快,但遗憾的是,它仍然是 O(N^2) 复杂度的解法。我不知道还有什么办法可以进一步降低复杂度。
答案 1 :(得分:0)
如果我正确理解了您想做的事情,那么我认为这个问题要复杂得多。归根结底,您需要遍历由所有可能的值交集构成的格(lattice)(请参阅 this figure 以了解我的意思)。我为您的问题编写了以下函数:
def findClusters(dictionary, minSize):
    """Find every group of names that share at least ``minSize`` values.

    The search walks the lattice of intersections: starting from each
    distinct value set, it intersects with the value sets that come later,
    then intersects those results with still-later sets, and so on. A branch
    is abandoned as soon as its intersection drops below ``minSize``, because
    further intersections can only shrink it.

    Args:
        dictionary: Mapping from a name to an iterable of hashable values.
            Duplicate values within one entry are ignored.
        minSize: Minimum number of shared values a cluster must have.

    Returns:
        A dict mapping each shared ``frozenset`` of values to the ``set`` of
        names that were found to share it.
    """
    from collections import deque

    # Names with identical value sets are merged into a single entry, so each
    # entry is a pair (set of values, set of names).
    namesByValues = {}
    for name, values in dictionary.items():
        values = frozenset(values)
        # Measure the size after de-duplication; a list padded with repeated
        # values must not pass the filter on its raw length.
        if len(values) >= minSize:
            namesByValues.setdefault(values, set()).add(name)
    setList = list(namesByValues.items())

    clusterDict = {}
    for i, (values, names) in enumerate(setList):
        if len(names) > 1:
            # Several names with exactly the same values already form a cluster.
            clusterDict.setdefault(values, set()).update(names)
        # Queue of open lattice nodes. Each node carries its values, its names
        # and the entries it may still be intersected with; initially those
        # are all entries after the current one in setList.
        pending = deque([(values, names, setList[i + 1:])])
        while pending:
            s1, k1, follow = pending.popleft()
            for j, (s2, k2) in enumerate(follow):
                s = s1 & s2
                # Only continue down this branch if enough values are shared.
                if len(s) >= minSize:
                    k = k1 | k2
                    clusterDict.setdefault(s, set()).update(k)
                    # The new node may only continue with the entries after
                    # this one, so each combination is explored once.
                    pending.append((s, k, follow[j + 1:]))
    return clusterDict
一个小例子:
# Small demonstration input: A and C hold identical values, while B and D
# each overlap them only partially.
minSize = 2
dictionary = dict(
    A=[1, 2, 3, 4],
    B=[1, 2, 3],
    C=[1, 2, 3, 4],
    D=[1, 4],
)
# Print one (shared values, names) pair per line.
print('\n'.join(str(entry) for entry in findClusters(dictionary, minSize).items()))
输出:
(frozenset({1, 2, 3, 4}), {'C', 'A'})
(frozenset({1, 2, 3}), {'C', 'A', 'B'})
(frozenset({1, 4}), {'C', 'D', 'A'})
带有问题中的数据:
# The data from the question, clustered with a minimum of five shared values.
minSize = 5
dictionary = dict(
    Mary=[7, 0, 19, 19, 9, 18, 8, 11, 6, 1],
    John=[0, 6, 7, 9, 18, 2, 4, 5, 13, 17],
    Paul=[17, 12, 18, 16, 9, 5, 6, 7, 0, 3],
    Joe=[4, 15, 2, 8, 3, 0, 6, 7, 9, 18],
    Peter=[5, 3, 10, 2, 4, 16, 7, 6, 15, 13],
    Maggie=[13, 6, 5, 4, 8, 9, 7, 18, 11, 10],
    Ken=[2, 18, 16, 6, 0, 17, 4, 15, 11, 7],
    Roger=[3, 1, 16, 4, 13, 14, 19, 11, 8, 0],
)
# Print one (shared values, names) pair per line.
print('\n'.join(str(entry) for entry in findClusters(dictionary, minSize).items()))
输出:
(frozenset({0, 6, 7, 9, 18}), {'Mary', 'Paul', 'John', 'Joe'})
(frozenset({0, 6, 7, 8, 9, 18}), {'Joe', 'Mary'})
(frozenset({6, 7, 8, 9, 11, 18}), {'Maggie', 'Mary'})
(frozenset({0, 6, 7, 11, 18}), {'Ken', 'Mary'})
(frozenset({0, 1, 8, 11, 19}), {'Roger', 'Mary'})
(frozenset({6, 7, 8, 9, 18}), {'Maggie', 'Joe', 'Mary'})
(frozenset({0, 5, 6, 7, 9, 17, 18}), {'Paul', 'John'})
(frozenset({0, 2, 4, 6, 7, 9, 18}), {'John', 'Joe'})
(frozenset({2, 4, 5, 6, 7, 13}), {'Peter', 'John'})
(frozenset({4, 5, 6, 7, 9, 13, 18}), {'Maggie', 'John'})
(frozenset({0, 2, 4, 6, 7, 17, 18}), {'Ken', 'John'})
(frozenset({5, 6, 7, 9, 18}), {'Maggie', 'Paul', 'John'})
(frozenset({0, 6, 7, 17, 18}), {'Paul', 'Ken', 'John'})
(frozenset({4, 6, 7, 9, 18}), {'Maggie', 'John', 'Joe'})
(frozenset({0, 2, 4, 6, 7, 18}), {'Ken', 'John', 'Joe'})
(frozenset({4, 5, 6, 7, 13}), {'Maggie', 'Peter', 'John'})
(frozenset({0, 3, 6, 7, 9, 18}), {'Paul', 'Joe'})
(frozenset({3, 5, 6, 7, 16}), {'Paul', 'Peter'})
(frozenset({0, 6, 7, 16, 17, 18}), {'Paul', 'Ken'})
(frozenset({2, 3, 4, 6, 7, 15}), {'Peter', 'Joe'})
(frozenset({4, 6, 7, 8, 9, 18}), {'Maggie', 'Joe'})
(frozenset({0, 2, 4, 6, 7, 15, 18}), {'Ken', 'Joe'})
(frozenset({2, 4, 6, 7, 15}), {'Peter', 'Ken', 'Joe'})
(frozenset({4, 5, 6, 7, 10, 13}), {'Maggie', 'Peter'})
(frozenset({2, 4, 6, 7, 15, 16}), {'Peter', 'Ken'})
(frozenset({4, 6, 7, 11, 18}), {'Maggie', 'Ken'})