# Sample input: every top-level entry names one sub-dictionary.  Only the
# sub-dictionaries' keys matter for the clustering question, not their values.
d = {
    'g1': {'p1': 1, 'p2': 5, 'p3': 11, 'p4': 1},
    'g2': {'p1': 7, 'p3': 1, 'p4': 2, 'p5': 8, 'p9': 11},
    'g3': {'p7': 7, 'p8': 7},
    'g4': {'p8': 9, 'p9': 1, 'p10': 7, 'p11': 8, 'p12': 3},
    'g5': {'p1': 4, 'p13': 1},
    'g6': {'p1': 4, 'p3': 1, 'p6': 2, 'p13': 1},
}
对于给定的字典 'd',我想返回至少共享两个(或 'n' 个)键的子字典集群(这些键必须存在于给定集群的所有子字典中)。这里我们并不关心这些子字典的值。换句话说,给定集群中所有子字典的键的交集长度应至少为 2(或 'n')。
答案 0 :(得分:2)
我希望我能正确理解你想要的东西。这种方法很笨拙,我担心它效率很低。
我在d中添加了字典g6,以便产生更有趣的输出:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Cluster the sub-dictionaries of ``d`` by the keys they have in common.
#
# ``clusters`` maps a frozenset of keys to the set of sub-dictionary names
# that all contain those keys.  "Primitive" clusters hold exactly one
# sub-dictionary; "non-primitive" clusters hold several.
#
# Runs unchanged on Python 2 and Python 3: every print() call receives a
# single pre-formatted string, and no dict is mutated while it is iterated.

# Minimum number of keys two key sets must share to form a derived cluster.
MIN_SHARED_KEYS = 2

d = {
    'g1': {'p1': 1, 'p2': 5, 'p3': 11, 'p4': 1},
    'g2': {'p1': 7, 'p3': 1, 'p4': 2, 'p5': 8, 'p9': 11},
    'g3': {'p7': 7, 'p8': 7},
    'g4': {'p8': 9, 'p9': 1, 'p10': 7, 'p11': 8, 'p12': 3},
    'g5': {'p1': 4, 'p13': 1},
    'g6': {'p1': 1, 'p9': 2, 'p11': 12},
}

# Step 1: one cluster per distinct key set, holding the names of the
# sub-dictionaries that have exactly those keys.
clusters = {}
for name, sub_dict in d.items():
    clusters.setdefault(frozenset(sub_dict), set()).add(name)

# Step 2: for every pair of different key sets that overlap in at least
# MIN_SHARED_KEYS keys, register the overlap as a cluster containing the
# members of both.  New entries are added to ``clusters`` inside the loops,
# so both loops walk list() snapshots: the outer snapshot is taken once, the
# inner one is refreshed per outer iteration so it also sees clusters derived
# earlier in this step.
for a in list(clusters):
    for b in list(clusters):
        shared = a & b
        if a == b or len(shared) < MIN_SHARED_KEYS:
            continue
        # Build the union first: ``shared`` may be equal to ``a`` or ``b``,
        # and a set must not be resized while it is being iterated.
        members = clusters[a] | clusters[b]
        clusters.setdefault(shared, set()).update(members)

# Step 3: report.  Keys and names are sorted so the output is deterministic,
# and the member sets are only read, never popped, so ``clusters`` stays
# intact for later use.
print("Primitive clusters")
for key_set, names in clusters.items():
    if len(names) == 1:
        print("The dictionary %s has the keys %s"
              % (next(iter(names)), ", ".join(sorted(key_set))))
print("---------------------")
print("Non-primitive clusters:")
for key_set, names in clusters.items():
    if len(names) > 1:
        print("The dictionaries %s share the keys %s"
              % (", ".join(sorted(names)), ", ".join(sorted(key_set))))
答案 1 :(得分:0)
像
# Greedy clustering: starting from each sub-dictionary in turn, collect the
# other sub-dictionaries that still share at least two keys with everything
# accepted so far, then print the resulting cluster.
#
# This yields one valid cluster per starting dictionary (every member shares
# all keys left in ``tempset``); it does not enumerate every possible cluster.
for keya in d:
    keys = set()
    # Keys common to every dictionary accepted into this cluster so far.
    tempset = set(d[keya])
    for keyb in d:
        candidate = tempset & set(d[keyb])
        # Narrow the shared key set only when keyb is actually accepted.
        # Intersecting unconditionally would let a rejected dictionary shrink
        # ``tempset`` and wrongly exclude every later candidate.
        if len(candidate) >= 2:
            tempset = candidate
            keys.add(keyb)
    print({key: d[key] for key in keys})
可能会工作。
编辑:不,它不起作用。我需要再考虑一下。
答案 2 :(得分:0)
我认为你应该首先“反转”字典,然后找到解决方案很简单:
import collections

# Invert the mapping: for every inner key, list the names of the
# sub-dictionaries of ``d`` that contain it.
inverted = collections.defaultdict(list)
for key, items in d.items():
    for sub_key in items:
        inverted[sub_key].append(key)

# Report each inner key that occurs in at least two sub-dictionaries.
# print() gets a single formatted string so the output is identical on
# Python 2 and Python 3.
for sub_key, keys in inverted.items():
    if len(keys) >= 2:
        print("%s %s" % (sub_key, keys))
答案 3 :(得分:0)
如果将问题简化为只有长度为2的聚类(即字典对),则会变得更加清晰:从给定的可迭代生成固定长度的子序列正是itertools.combinations的工作:
>>> list(itertools.combinations(d, 2))
[('g5', 'g4'), ('g5', 'g3'), ('g5', 'g2'), ('g5', 'g1'), ('g4', 'g3'), ('g4', 'g
2'), ('g4', 'g1'), ('g3', 'g2'), ('g3', 'g1'), ('g2', 'g1')]
通过认识到视图d.keys()的行为类似于一个集合(在Python 3中;在Python 2中,它可能是一个列表),我们可以看到任何字典共有的键的数量:
>>> d['g1'].keys() & d['g2'].keys()
{'p3', 'p1', 'p4'}
& 是集合交集运算符——它为我们提供了这些集合共有的所有项目的集合。因此,我们可以通过检查这个集合的长度来判断其中是否至少有两个共有的键,这给了我们:
>>> common_pairs = [[x,y] for x,y in itertools.combinations(d, 2)
if len(d[x].keys() & d[y].keys()) >= 2]
>>> common_pairs
[['g2', 'g1']]
解决未知的簇大小稍微困难一些——如果不把簇的大小硬编码,我们就不能直接使用 & 运算符。值得庆幸的是,set 类为我们提供了一种方法,以 set.intersection 的形式获取 n 个集合的交集。它不会接受 dict_keys 实例,但你可以通过调用 set 来轻松解决这个问题:
>>> set.intersection(d['g1'].keys(), d['g2'].keys(), d['g5'].keys())
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: descriptor 'intersection' requires a 'set' object but received a 'dict_keys'
>>> set.intersection(set(d['g1']), set(d['g2']), set(d['g5']))
{'p1'}
您应该能够将此概括为大小为2到 n 的群集。