如果我有多个列表如下
list1 = ['one', 'three', 'four', 'six', 'seven', 'nine', 'zero']
list2 = ['two', 'four', 'five', 'six', 'eight', 'ten']
list3 = ['one', 'two', 'zero', 'three', 'seven']
list4 = ['four', 'five', 'six', 'eight', 'ten']
list5 = ['zero', 'one', 'three', 'four', 'seven', 'ten']
list6 = ['one', 'two']
并且所有元素都由一些固定项目组成,如下所示
list_main = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
问题是:我需要根据各列表中共同出现的元素对列表进行分组,因此结果应该是这样的
list2, list4 -- because they all have same 5 items ('four', 'five', 'ten', 'six', 'eight')
list1, list3, list5 -- because they all have same 4 items ('zero', 'one', 'three', 'seven')
编辑:就像上面提到的例子一样
我希望将最匹配的列表放在一起,并将它们从下一个循环中排除
当我再次运行该过程时,它将查看剩余的列表并找到最匹配的列表并将它们从下一个循环中排除,依此类推,直到没有剩余的列表。
修改:@padraic 的解决方案(solution)是最好的,下面是完整的代码(包括所需的导入),谢谢 @padraic
from collections import defaultdict
from copy import deepcopy
from operator import itemgetter
def srt(args):
    """Sort every sub-list in place and yield it with its 1-based position.

    Args:
        args: iterable of lists. Each list is sorted in place, so pass a copy
            when the original element order must be preserved.

    Yields:
        ``(index, sub_list)`` tuples; ``index`` starts at 1 and ``sub_list``
        is the same list object that was passed in, now sorted.
    """
    for ind, sub in enumerate(args, 1):
        sub.sort()
        yield ind, sub
list1 = ['one', 'three', 'four', 'six', 'seven', 'nine', 'zero']
list2 = ['two', 'four', 'five', 'six', 'eight', 'ten']
list3 = ['one', 'two', 'zero', 'three', 'seven']
list4 = ['four', 'five', 'six', 'eight', 'ten']
list5 = ['zero', 'one', 'three', 'four', 'seven', 'ten']
list6 = ['one', 'two']

# d[a][b] holds the size of the best intersection recorded between lists a and b
# (1-based positions in ``orig``); links are stored in both directions.
d = defaultdict(defaultdict)
orig = [list1, list2, list3, list4, list5, list6]
# all_best[a] is the largest intersection list a has with any other list.
all_best = defaultdict(int)
# srt() sorts in place, so work on a deep copy to leave ``orig`` untouched.
subs = sorted(srt(deepcopy(orig)), key=itemgetter(1))

for ind, ele in subs:
    best, partner = None, None
    for i2, ele2 in subs:
        if ind == i2:
            continue
        _int = len(set(ele).intersection(ele2))
        # Strict comparison: on ties the first partner encountered is kept.
        if best is None or best < _int:
            best = _int
            partner = i2
    if all_best[ind] < best:
        all_best[ind] = best
    d[ind][partner] = best
    d[partner][ind] = best

grouped = []
used = set()
for k, v in d.items():
    # Keep k's neighbourhood only when every link is also the neighbour's own
    # best score, i.e. none of the neighbours matches some other list better.
    if all(val == all_best[_k] for _k, val in v.items()):
        best = [k] + list(v)
        # A list may belong to one group only.
        if not any(s in used for s in best):
            grouped.append(best)
            used.update(best)

# Group and member order follow dict insertion order, so they may be listed in
# a different order than in older interpreters.
print(grouped)
print([[orig[ind - 1] for ind in grp] for grp in grouped])
答案 0 :(得分:1)
这将告诉你list1和list2共有多少项:
len(set(list1).intersection(list2))
您可以执行以下操作:
from collections import defaultdict
from itertools import combinations

# Maps every distinct pairwise intersection (stored as a sorted tuple so it can
# be a dict key) to the set of lists that share exactly those elements.
intersections = defaultdict(set)

lists = [
    [1, 2, 3],
    [2, 3, 4],
    # and so on
]

for first, second in combinations(lists, 2):
    intsec = set(first).intersection(second)
    key = tuple(sorted(intsec))
    # Lists are unhashable, so store them as tuples; record BOTH members of
    # the pair under their common intersection.
    intersections[key].add(tuple(first))
    intersections[key].add(tuple(second))
这将建立一个记录所有列表两两之间共同元素的字典。您可以按 intersections 中键的长度(即共享元素的数量)对其排序;但是,一旦您因为某个列表已经在别的分组中出现过而开始排除它,您实际上就是在依据任意的(?)顺序做决定。例如,intersections 中可能有好几个长度同为 5 的不同键:您应该保留哪一个?又应该把哪些当作重复项删除?
答案 1 :(得分:0)
最好的开始方式是对所有子列表进行排序并对每个匹配进行排名:
from copy import deepcopy
def srt(args):
    """Sort every sub-list in place and yield it with its 1-based position.

    Args:
        args: iterable of lists. Each list is sorted in place, so pass a copy
            when the original element order must be preserved.

    Yields:
        ``(index, sub_list)`` tuples; ``index`` starts at 1 and ``sub_list``
        is the same list object that was passed in, now sorted.
    """
    for ind, sub in enumerate(args, 1):
        sub.sort()
        yield ind, sub
# These two names are used below but were never imported in this snippet.
from collections import defaultdict
from operator import itemgetter

# d[a][b] holds the size of the best intersection recorded between lists a and b
# (1-based positions in ``orig``); links are stored in both directions.
d = defaultdict(defaultdict)
orig = [list1, list2, list3, list4, list5, list6]
# all_best[a] is the largest intersection list a has with any other list.
all_best = defaultdict(int)
# srt() sorts in place, so work on a deep copy to leave ``orig`` untouched.
subs = sorted(srt(deepcopy(orig)), key=itemgetter(1))

for ind, ele in subs:
    best, partner = None, None
    for i2, ele2 in subs:
        if ind == i2:
            continue
        _int = len(set(ele).intersection(ele2))
        # Strict comparison: on ties the first partner encountered is kept.
        if best is None or best < _int:
            best = _int
            partner = i2
    if all_best[ind] < best:
        all_best[ind] = best
    d[ind][partner] = best
    d[partner][ind] = best

grouped = []
used = set()
for k, v in d.items():
    # Keep k's neighbourhood only when every link is also the neighbour's own
    # best score, i.e. none of the neighbours matches some other list better.
    if all(val == all_best[_k] for _k, val in v.items()):
        best = [k] + list(v)
        # A list may belong to one group only.
        if not any(s in used for s in best):
            grouped.append(best)
            used.update(best)

# Group and member order follow dict insertion order, so they may be listed in
# a different order than in older interpreters.
print(grouped)
print([[orig[ind - 1] for ind in grp] for grp in grouped])
输出:
[[1, 3, 5], [2, 4]]
[[['one', 'three', 'four', 'six', 'seven', 'nine', 'zero'],
['one', 'two', 'zero', 'three', 'seven'],
['zero', 'one', 'three', 'four', 'seven', 'ten']],
[['two', 'four', 'five', 'six', 'eight', 'ten'],
['four', 'five', 'six', 'eight', 'ten']]]
答案 2 :(得分:0)
这样的事情
>>> my_lists=[ list1, list2, list3, list4, list5, list6 ]
>>> my_list_of_set=[set(x) for x in my_lists ]
>>> my_list_of_set.sort(key=len)
>>> result=[]
>>> while my_list_of_set:
elem = my_list_of_set.pop()
max_inter=max(map(lambda x:x&elem, my_list_of_set), key=len, default=set())
inter = [ x for x in my_list_of_set if elem & x == max_inter]
for x in inter:
my_list_of_set.remove(x)
inter.append(elem)
result.append( (max_inter,inter) )
>>> for x in result:
print("the matching elem:",x[0])
print("the lists:",x[1])
print()
the matching elem: {'three', 'one', 'zero', 'seven', 'four'}
the lists: [{'one', 'seven', 'four', 'three', 'ten', 'zero'}, {'one', 'nine', 'seven', 'four', 'three', 'six', 'zero'}]
the matching elem: {'five', 'six', 'ten', 'eight', 'four'}
the lists: [{'five', 'six', 'ten', 'eight', 'four'}, {'four', 'two', 'five', 'six', 'ten', 'eight'}]
the matching elem: {'one', 'two'}
the lists: [{'one', 'two'}, {'seven', 'three', 'one', 'zero', 'two'}]
>>>
首先把所有列表都转换成集合,以便使用交集运算,并把它们保存在一个新列表中;按长度对该列表排序,从最长的集合开始,找出它与其余集合的最大交集;然后把所有与它具有同样最大交集的集合归为一组,将它们从列表中删除并保存结果。
修改
这是解决问题的另一种方法
import operator as op
from itertools import chain, combinations
from functools import reduce
from pprint import pprint
from collections import namedtuple
def powerset_since_n(iterable, n=0):
    """Return an iterator over every combination of *iterable* of size >= *n*.

    Combinations are produced as tuples, smallest size first (``n``,
    ``n + 1``, ... up to the full length of the input).
    """
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(n, len(s) + 1))


Groups = namedtuple("Groups", "sublist common_elem")  # for aesthetic reasons


def get_common_groups(ListOfList) -> "[ (indexs,common_elements) ]":
    """Greedily split the sub-lists into groups that share common elements.

    Starting from the longest sub-list still unassigned, pick the group of
    unassigned sub-lists containing it that maximises
    ``2 * (number of sub-lists) + (number of common elements)``, emit it and
    remove its members from further consideration.

    Args:
        ListOfList: sequence of iterables of hashable items.

    Returns:
        A list of ``Groups(sublist, common_elem)``: ``sublist`` is the set of
        indexes (into ``ListOfList``) in the group and ``common_elem`` the set
        of elements shared by all of them. A sub-list that shares nothing with
        the remaining ones forms a group of its own with an empty
        ``common_elem``.

    Note:
        All combinations of two or more sub-lists are enumerated up front, so
        the cost grows exponentially with the number of sub-lists.
    """
    my_dict = {i: set(x) for i, x in enumerate(ListOfList)}
    order = sorted(my_dict, key=lambda x: len(my_dict[x]))
    all_inters = dict()
    # calculate all possible non-empty intersections of 2 or more sets
    for grupo in powerset_since_n(order, 2):
        temp = reduce(op.and_, map(my_dict.get, grupo))
        if temp:
            # the frozenset makes the order of the indexes irrelevant
            all_inters[frozenset(grupo)] = temp
    result = []
    # kept as a set so the candidate filter below does not rebuild it each time
    order_set = set(order)
    while order:
        elem = order.pop()
        order_set.remove(elem)
        # Only groups whose other members are ALL still unassigned qualify;
        # otherwise an index consumed by an earlier group could be emitted
        # a second time.
        grupo = max(
            (k for k in all_inters if elem in k and k - {elem} <= order_set),
            # this is what decides who is the max: the number of sub-lists
            # weighs double
            key=lambda kk: 2 * len(kk) + len(all_inters[kk]),
            default=frozenset(),  # the empty case
        )
        for x in grupo:  # remove the selected ones
            if x in order_set:
                order.remove(x)
                order_set.remove(x)
        grupo_final = set(grupo)
        grupo_final.add(elem)  # for the case in which grupo is empty
        result.append(Groups(grupo_final, all_inters.get(grupo, set())))
    return result
这一次,我先计算给定列表的幂集(只取包含两个或更多子列表的组合),然后从最长的列表开始,确定包含它的最大分组。评分时把分组中列表的数量与它们共同元素的数量相加,并给列表数量额外的权重(乘以 2),这样包含更多列表的分组会被优先选中;剩下的交给 max 去完成,其余部分不言自明。
这是一个快速测试
>>> my_lists=[ list1, list2, list3, list4, list5, list6 ]
>>> pprint(get_common_groups(my_lists))
[Groups(sublist={0, 2, 4}, common_elem={'seven', 'zero', 'one', 'three'}),
Groups(sublist={1, 3}, common_elem={'ten', 'eight', 'five', 'four', 'six'}),
Groups(sublist={5}, common_elem=set())]
>>> my_lists[0]
['one', 'three', 'four', 'six', 'seven', 'nine', 'zero']
>>> my_lists[1]
['two', 'four', 'five', 'six', 'eight', 'ten']
>>> my_lists[5]
['one', 'two']
>>>
答案 3 :(得分:0)
首先,您需要将列表包装在另一个列表中:
lists = [['one', 'three', 'four', 'six', 'seven', 'nine', 'zero'],
['two', 'four', 'five', 'six', 'eight', 'ten'], ...]
然后,您需要对 lists 列表中任意两个列表的所有组合执行操作。itertools 模块可以帮助您:
import itertools
for pair in itertools.combinations(lists, r=2):
    # each pair is a tuple of lists ([], [])
    pass  # placeholder: process the pair here
然后您可以利用交集来选出最匹配的列表:您要找的是包含元素最多的那个交集。完整的代码如下所示:
import itertools
#Use your lists. I used those to test.
lists = [[1, 2, 3, 4],
[2, 3, 4, 5, 6],
[3, 4, 5, 1],
[8, 5, 6, 9, 2, 4]]
# Intersects a pair of lists and builds the dictionary
def intersect_pair(pair):
    """Return the intersection of the two iterables in *pair* with the pair.

    Args:
        pair: a two-item sequence of iterables of hashable items.

    Returns:
        A dict with keys ``'intersection'`` (the set of shared items) and
        ``'initial_pair'`` (the *pair* object itself, unchanged).
    """
    return {
        'intersection': set(pair[0]).intersection(pair[1]),
        'initial_pair': pair,
    }
# Determines the intersection size for the 'max' function
def intersect_size(intersection):
    """Return the number of shared items in an ``intersect_pair`` style dict.

    Args:
        intersection: a mapping whose ``'intersection'`` entry is a sized
            collection.

    Returns:
        ``len(intersection['intersection'])``.
    """
    return len(intersection['intersection'])
# Build one intersection record per unordered pair of lists, then keep the
# record whose intersection holds the most elements.
pairs = list(map(intersect_pair, itertools.combinations(lists, 2)))
most_matching_pair = max(pairs, key=intersect_size)

# The winning record: the two best-matching lists together with their intersection.
print(most_matching_pair)
答案 4 :(得分:-1)
移植到python3:
from collections import OrderedDict
from functools import reduce
list1 = ['one', 'three', 'four', 'six', 'seven', 'nine', 'zero']
list2 = ['two', 'four', 'five', 'six', 'eight', 'ten']
list3 = ['one', 'two', 'zero', 'three', 'seven']
list4 = ['four', 'five', 'six', 'eight', 'ten']
list5 = ['zero', 'one', 'three', 'four', 'seven', 'ten']
list6 = ['one', 'two']


def varname(var):
    """Return a name bound to the object *var*, or None when none is found.

    Candidate names are read from the frame three levels up the call stack.
    In this script that is the module frame, because the call chain is
    ``varname`` <- eval'd string <- ``reduce`` lambda <- module. Called from
    a different stack depth it inspects a different frame.
    """
    import inspect

    frame = inspect.currentframe()
    var_id = id(var)
    for name in frame.f_back.f_back.f_back.f_locals.keys():
        try:
            if id(eval(name)) == var_id:
                return name
        except Exception:
            # Best effort: a name that cannot be evaluated from this scope is
            # simply not a candidate.
            continue
    return None


size = len([list1, list2, list3, list4, list5, list6])
s = {}
# One "{varname(listN):set(listN)}" expression string per list.
l = ['{varname(list%d):set(list%d)}' % (i, i) for i in range(1, size + 1)]
# NOTE(review): eval() here only ever sees the strings built just above (and the
# repr of the partial result); never feed it external input.
# The lambda must stay a plain one-level function: varname() depends on the
# exact call depth described in its docstring.
d = reduce(lambda x, y: {**eval(f'{x}'), **eval(f'{y}')}, l)

# s maps each unordered pair of list names (lower number first) to the size of
# their intersection.
for k1, a in d.items():
    for k2, b in d.items():
        if k1 != k2 and (k2, k1) not in s and (k1, k2) not in s:
            # Compare the whole numeric suffix so names past "list9" also work.
            if int(k1[4:]) > int(k2[4:]):
                s[(k2, k1)] = len(a.intersection(b))
            else:
                s[(k1, k2)] = len(a.intersection(b))

# x inverts s: intersection size -> list of pairs having that size.
x = OrderedDict()
for k, v in s.items():
    x.setdefault(v, []).append(k)
    print(v, ": ", k)
x = OrderedDict(sorted(x.items()))

for k, v in x.items():
    for tpl in v:
        # d already holds each list as a set, so no eval() is needed here.
        print(k, "times: %s intersects with " % str(tpl), ":", list(d[tpl[0]].intersection(d[tpl[1]])))
列表l应为:
['{varname(list1):set(list1)}',
'{varname(list2):set(list2)}',
'{varname(list3):set(list3)}',
'{varname(list4):set(list4)}',
'{varname(list5):set(list5)}',
'{varname(list6):set(list6)}']
dict d应该是:
{'list1': {'four', 'nine', 'one', 'seven', 'six', 'three', 'zero'},
'list2': {'eight', 'five', 'four', 'six', 'ten', 'two'},
'list3': {'one', 'seven', 'three', 'two', 'zero'},
'list4': {'eight', 'five', 'four', 'six', 'ten'},
'list5': {'four', 'one', 'seven', 'ten', 'three', 'zero'},
'list6': {'one', 'two'}}
我们有交叉列表:
{('list1', 'list2'): 2,
('list1', 'list3'): 4,
('list1', 'list4'): 2,
('list1', 'list5'): 5,
('list1', 'list6'): 1,
('list2', 'list3'): 1,
('list2', 'list4'): 5,
('list2', 'list5'): 2,
('list2', 'list6'): 1,
('list3', 'list4'): 0,
('list3', 'list5'): 4,
('list3', 'list6'): 2,
('list4', 'list5'): 2,
('list4', 'list6'): 0,
('list5', 'list6'): 1}
x 则把 s 反过来按值分组:键是交集的大小,值是具有该交集大小的所有列表对,然后按键排序:
OrderedDict([(0, [('list3', 'list4'), ('list4', 'list6')]),
(1,
[('list1', 'list6'),
('list2', 'list3'),
('list2', 'list6'),
('list5', 'list6')]),
(2,
[('list1', 'list2'),
('list1', 'list4'),
('list2', 'list5'),
('list3', 'list6'),
('list4', 'list5')]),
(4, [('list1', 'list3'), ('list3', 'list5')]),
(5, [('list1', 'list5'), ('list2', 'list4')])])
最终打印出来的结果如下:
0 times: ('list3', 'list4') intersects with : []
0 times: ('list4', 'list6') intersects with : []
1 times: ('list1', 'list6') intersects with : ['one']
1 times: ('list2', 'list3') intersects with : ['two']
1 times: ('list2', 'list6') intersects with : ['two']
1 times: ('list5', 'list6') intersects with : ['one']
2 times: ('list1', 'list2') intersects with : ['four', 'six']
2 times: ('list1', 'list4') intersects with : ['four', 'six']
2 times: ('list2', 'list5') intersects with : ['four', 'ten']
2 times: ('list3', 'list6') intersects with : ['one', 'two']
2 times: ('list4', 'list5') intersects with : ['four', 'ten']
4 times: ('list1', 'list3') intersects with : ['seven', 'zero', 'one', 'three']
4 times: ('list3', 'list5') intersects with : ['seven', 'zero', 'one', 'three']
5 times: ('list1', 'list5') intersects with : ['seven', 'one', 'four', 'three', 'zero']
5 times: ('list2', 'list4') intersects with : ['five', 'four', 'six', 'eight', 'ten']