优化Python字典中的相交过程

时间:2018-12-06 15:56:40

标签: python algorithm

嗨,我编写了一段代码,用于找出字典中每两个键对应的列表之间 5 个或更多的相同元素。

dictionary = {
    'Mary': [7, 0, 19, 19, 9, 18, 8, 11, 6, 1],
    'John': [0, 6, 7, 9, 18, 2, 4, 5, 13, 17],
    'Paul': [17, 12, 18, 16, 9, 5, 6, 7, 0, 3],
    'Joe': [4, 15, 2, 8, 3, 0, 6, 7, 9, 18],
    'Peter': [5, 3, 10, 2, 4, 16, 7, 6, 15, 13],
    'Maggie': [13, 6, 5, 4, 8, 9, 7, 18, 11, 10],
    'Ken': [2, 18, 16, 6, 0, 17, 4, 15, 11, 7],
    'Roger': [3, 1, 16, 4, 13, 14, 19, 11, 8, 0],
}
clusterDict = {}

# Compare every ordered pair of distinct names, so each unordered pair is
# visited twice (once in each direction).
for name, numbers in dictionary.items():
    for otherName, otherNumbers in dictionary.items():
        if name == otherName:
            continue
        shared = sorted(set(numbers) & set(otherNumbers))
        if len(shared) < 5:
            continue
        # The sorted intersection, rendered as text, is the cluster key.
        label = str(shared)
        clusterDict.setdefault(label, []).extend([name, otherName])

# Each name was recorded once per direction (and once per matching partner);
# collapse the repeats.
clusterDict = {label: list(set(names)) for label, names in clusterDict.items()}

print(clusterDict)

如果我向字典中添加更多键值对,处理速度会大大降低。我想知道是否有任何方法可以更快或更优化地找到交集/公共元素。先谢谢了。

2 个答案:

答案 0 :(得分:0)

您可以通过预先将所有列表转换为集合,并且不进行冗余检查来节省大量的时间(也就是说,对于列表[A, B, C],您当前的代码实际上会同时检查 A intersect B 和 B intersect A)。
您可以利用itertools.combinations生成所有可能的组合。

from itertools import combinations
dictionary = {
    'Mary': [7, 0, 19, 19, 9, 18, 8, 11, 6, 1],
    'John': [0, 6, 7, 9, 18, 2, 4, 5, 13, 17],
    'Paul': [17, 12, 18, 16, 9, 5, 6, 7, 0, 3],
    'Joe': [4, 15, 2, 8, 3, 0, 6, 7, 9, 18],
    'Peter': [5, 3, 10, 2, 4, 16, 7, 6, 15, 13],
    'Maggie': [13, 6, 5, 4, 8, 9, 7, 18, 11, 10],
    'Ken': [2, 18, 16, 6, 0, 17, 4, 15, 11, 7],
    'Roger': [3, 1, 16, 4, 13, 14, 19, 11, 8, 0],
}
# Convert every list to a set once, up front, so the pair loop never has to
# rebuild it.
dict_of_sets = {k: set(v) for k, v in dictionary.items()}
clusterDict = {}

# combinations() yields each unordered pair of entries exactly once, so
# "A with B" is never re-checked as "B with A".
for (key1, value1), (key2, value2) in combinations(dict_of_sets.items(), 2):
    intersect = value1.intersection(value2)
    if len(intersect) >= 5:
        # A sorted tuple is a hashable, order-independent key; wrap it in
        # str() if string keys are preferred.
        names = clusterDict.setdefault(tuple(sorted(intersect)), [])
        # Several pairs can share the same intersection (e.g. Mary/John and
        # Mary/Paul), so add each name only if it is not already listed.
        for name in (key1, key2):
            if name not in names:
                names.append(name)

请注意,您还可以使用元组作为字典键,这在我看来至少比将列表类型转换为字符串更干净。但是,请随时根据需要更改该部分。

这应该会更快,但遗憾的是,这仍然是一个 O(N^2) 复杂度的解决方案。我不知道有什么方法可以进一步降低复杂度。

答案 1 :(得分:0)

如果我没有理解错您要做的事情,那么我认为这个问题要复杂得多。归根结底,您需要遍历由所有可能的值交集构成的格(lattice)(请参阅原回答中链接的图示以了解我的意思)。我为您的问题编写了以下函数:

def findClusters(dictionary, minSize):
    """Find groups of names whose value lists share at least ``minSize`` values.

    Starting from each name's de-duplicated value set, the function
    repeatedly intersects a set with the sets that come after it, keeping
    every intersection that still has at least ``minSize`` distinct values.

    Args:
        dictionary: Mapping of name -> iterable of hashable values.
        minSize: Minimum number of distinct shared values for a cluster.

    Returns:
        A dict mapping each shared value set (a ``frozenset``) to the set of
        names that were combined to produce it.
    """
    from collections import deque

    # Group names by their de-duplicated value set: two names may have
    # exactly the same values, so each entry pairs a value set with a set
    # of names.
    namesByValues = {}
    for name, values in dictionary.items():
        values = frozenset(values)
        # Measure the size after de-duplication; a list with repeated
        # values must not pass the filter with too few distinct values.
        if len(values) >= minSize:
            namesByValues.setdefault(values, set()).add(name)
    setList = list(namesByValues.items())

    clusterDict = {}
    for i, (values, names) in enumerate(setList):
        if len(names) > 1:
            # Two or more names with identical values already form a cluster.
            clusterDict.setdefault(values, set()).update(names)
        # Queue of "open" lattice nodes. Each node carries the continuations
        # it may still be combined with; initially those are all entries
        # after it in setList.
        pending = deque([(values, names, setList[i + 1:])])
        while pending:
            nodeValues, nodeNames, continuations = pending.popleft()
            for j, (otherValues, otherNames) in enumerate(continuations):
                common = nodeValues.intersection(otherValues)
                # Only continue down this branch if it is big enough.
                if len(common) >= minSize:
                    merged = nodeNames.union(otherNames)
                    clusterDict.setdefault(common, set()).update(merged)
                    # The new node may only be combined with continuations
                    # that come after the one just used.
                    pending.append((common, merged, continuations[j + 1:]))
    return clusterDict

一个小例子:

# Four small value lists; 'A' and 'C' hold exactly the same values.
dictionary = dict(
    A=[1, 2, 3, 4],
    B=[1, 2, 3],
    C=[1, 2, 3, 4],
    D=[1, 4],
)
minSize = 2
clusters = findClusters(dictionary, minSize)
# Print one (shared values, names) pair per line.
print('\n'.join(str(item) for item in clusters.items()))

输出:

(frozenset({1, 2, 3, 4}), {'C', 'A'})
(frozenset({1, 2, 3}), {'C', 'A', 'B'})
(frozenset({1, 4}), {'C', 'D', 'A'})

带有问题中的数据:

# The data set from the question.
dictionary = dict(
    Mary=[7, 0, 19, 19, 9, 18, 8, 11, 6, 1],
    John=[0, 6, 7, 9, 18, 2, 4, 5, 13, 17],
    Paul=[17, 12, 18, 16, 9, 5, 6, 7, 0, 3],
    Joe=[4, 15, 2, 8, 3, 0, 6, 7, 9, 18],
    Peter=[5, 3, 10, 2, 4, 16, 7, 6, 15, 13],
    Maggie=[13, 6, 5, 4, 8, 9, 7, 18, 11, 10],
    Ken=[2, 18, 16, 6, 0, 17, 4, 15, 11, 7],
    Roger=[3, 1, 16, 4, 13, 14, 19, 11, 8, 0],
)
minSize = 5
clusters = findClusters(dictionary, minSize)
# Print one (shared values, names) pair per line.
print('\n'.join(str(item) for item in clusters.items()))

输出:

(frozenset({0, 6, 7, 9, 18}), {'Mary', 'Paul', 'John', 'Joe'})
(frozenset({0, 6, 7, 8, 9, 18}), {'Joe', 'Mary'})
(frozenset({6, 7, 8, 9, 11, 18}), {'Maggie', 'Mary'})
(frozenset({0, 6, 7, 11, 18}), {'Ken', 'Mary'})
(frozenset({0, 1, 8, 11, 19}), {'Roger', 'Mary'})
(frozenset({6, 7, 8, 9, 18}), {'Maggie', 'Joe', 'Mary'})
(frozenset({0, 5, 6, 7, 9, 17, 18}), {'Paul', 'John'})
(frozenset({0, 2, 4, 6, 7, 9, 18}), {'John', 'Joe'})
(frozenset({2, 4, 5, 6, 7, 13}), {'Peter', 'John'})
(frozenset({4, 5, 6, 7, 9, 13, 18}), {'Maggie', 'John'})
(frozenset({0, 2, 4, 6, 7, 17, 18}), {'Ken', 'John'})
(frozenset({5, 6, 7, 9, 18}), {'Maggie', 'Paul', 'John'})
(frozenset({0, 6, 7, 17, 18}), {'Paul', 'Ken', 'John'})
(frozenset({4, 6, 7, 9, 18}), {'Maggie', 'John', 'Joe'})
(frozenset({0, 2, 4, 6, 7, 18}), {'Ken', 'John', 'Joe'})
(frozenset({4, 5, 6, 7, 13}), {'Maggie', 'Peter', 'John'})
(frozenset({0, 3, 6, 7, 9, 18}), {'Paul', 'Joe'})
(frozenset({3, 5, 6, 7, 16}), {'Paul', 'Peter'})
(frozenset({0, 6, 7, 16, 17, 18}), {'Paul', 'Ken'})
(frozenset({2, 3, 4, 6, 7, 15}), {'Peter', 'Joe'})
(frozenset({4, 6, 7, 8, 9, 18}), {'Maggie', 'Joe'})
(frozenset({0, 2, 4, 6, 7, 15, 18}), {'Ken', 'Joe'})
(frozenset({2, 4, 6, 7, 15}), {'Peter', 'Ken', 'Joe'})
(frozenset({4, 5, 6, 7, 10, 13}), {'Maggie', 'Peter'})
(frozenset({2, 4, 6, 7, 15, 16}), {'Peter', 'Ken'})
(frozenset({4, 6, 7, 11, 18}), {'Maggie', 'Ken'})