Consider some lists of integers:
#--------------------------------------
0 [0,1,3]
1 [1,0,3,4,5,10,...]
2 [2,8]
3 [3,1,0,...]
...
n []
#--------------------------------------
The question is how to merge all lists that share at least one common element. So the result for just the part shown above would be:
#--------------------------------------
0 [0,1,3,4,5,10,...]
2 [2,8]
#--------------------------------------
What is the most efficient way to do this on big data (the elements are just numbers)? Is a tree structure something to think about?
I do the job now by converting the lists to sets and iterating over intersections, but it is slow! Furthermore, I have the feeling it is so elementary! In addition, the implementation lacks something (unknown), because some lists stay unmerged sometimes! Having said that, if you propose a self-implementation, please be generous and provide a simple sample code [Python is obviously my favourite :)] or pseudo code.
Update 1:
Here is the code I am using:
#--------------------------------------
lsts = [[0,1,3],
[1,0,3,4,5,10,11],
[2,8],
[3,1,0,16]];
#--------------------------------------
The function is (buggy!!):
#--------------------------------------
def merge(lsts):
    sts = [set(l) for l in lsts]
    i = 0
    while i < len(sts):
        j = i+1
        while j < len(sts):
            if len(sts[i].intersection(sts[j])) > 0:
                sts[i] = sts[i].union(sts[j])
                sts.pop(j)
            else: j += 1     #---corrected
        i += 1
    lst = [list(s) for s in sts]
    return lst
#--------------------------------------
The result is:
#--------------------------------------
>>> merge(lsts)
>>> [0, 1, 3, 4, 5, 10, 11, 16], [8, 2]]
#--------------------------------------
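For what it's worth, here is a small input (my own construction, not part of the original post) on which the function above leaves lists unmerged: once sts[j] is folded into sts[i], the sets that were already compared against the old sts[i] are never revisited, so a bridge discovered later cannot pull them back in.
#--------------------------------------
>>> merge([[1, 2], [3, 4], [2, 3]])
[[1, 2, 3], [3, 4]]
#--------------------------------------
Here [2, 3] connects the two groups, but [3, 4] was already passed over and is never re-checked, so the expected single group [1, 2, 3, 4] is not produced.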
Update 2: In my experience, the code given below by Niklas Baumstark turned out to be a bit faster for the simple cases. I have not tested the method given by "Hooked" yet, since it is a completely different approach (interesting, by the way). The testing procedure for all of these could be really hard, or impossible, to make sure of the results. The real data set I will use is so large and complex that it is impossible to trace any error just by repeating. That is, I need to be 100% satisfied with the reliability of the method before pushing it into its place, as a module, within a large code. For now Niklas' method is faster, and the answer for simple sets is of course correct. However, how can I be sure that it works well for a really large data set? Since I will not be able to trace the errors visually!
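One way to gain some confidence without tracing errors by hand (a rough sketch; merge_a and merge_b stand for any two of the implementations given in the answers, not specific functions) is to cross-check two independent implementations on randomly generated lists and assert that they produce the same partition:
#--------------------------------------
import random

def same_partition(a, b):
    # order-independent comparison of two results (lists/sets of hashable elements)
    canon = lambda groups: sorted(tuple(sorted(g)) for g in groups)
    return canon(a) == canon(b)

for trial in range(100):
    data = [random.sample(range(50), random.randint(1, 10)) for _ in range(20)]
    assert same_partition(merge_a(data), merge_b(data))  # merge_a, merge_b: any two of the answers below
#--------------------------------------
This does not prove either implementation correct, but two independent algorithms agreeing on many random inputs is usually a strong signal.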
Update 3: Note that the reliability of the method is much more important than its speed for this problem. Hopefully I will be able to translate the Python code to Fortran in the end, for maximum performance.
Update 4:
There are many interesting points in this post, and generously given answers and constructive comments. I would recommend reading all of them thoroughly. Please accept my appreciation for the development of the question, the amazing answers, and the constructive comments and discussion.
Answer 0 (score: 21)
My attempt:
def merge(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = True
    while merged:
        merged = False
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = True
                    common |= x
            results.append(common)
        sets = results
    return sets
lst = [[65, 17, 5, 30, 79, 56, 48, 62],
[6, 97, 32, 93, 55, 14, 70, 32],
[75, 37, 83, 34, 9, 19, 14, 64],
[43, 71],
[],
[89, 49, 1, 30, 28, 3, 63],
[35, 21, 68, 94, 57, 94, 9, 3],
[16],
[29, 9, 97, 43],
[17, 63, 24]]
print merge(lst)
Benchmark:
import random
# adapt parameters to your own usage scenario
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
if False:  # change to true to generate the test data file (takes a while)
    with open("/tmp/test.txt", "w") as f:
        lists = []
        classes = [
            range(class_size * i, class_size * (i + 1)) for i in range(class_count)
        ]
        for c in classes:
            # distribute each class across ~300 lists
            for i in xrange(list_count_per_class):
                lst = []
                if random.random() < large_list_probability:
                    size = random.choice(large_list_sizes)
                else:
                    size = random.choice(small_list_sizes)
                nums = set(c)
                for j in xrange(size):
                    x = random.choice(list(nums))
                    lst.append(x)
                    nums.remove(x)
                random.shuffle(lst)
                lists.append(lst)
        random.shuffle(lists)
        for lst in lists:
            f.write(" ".join(str(x) for x in lst) + "\n")
setup = """
# Niklas'
def merge_niklas(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = 1
    while merged:
        merged = 0
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = 1
                    common |= x
            results.append(common)
        sets = results
    return sets
# Rik's
def merge_rik(data):
    sets = (set(e) for e in data if e)
    results = [next(sets)]
    for e_set in sets:
        to_update = []
        for i, res in enumerate(results):
            if not e_set.isdisjoint(res):
                to_update.insert(0, i)
        if not to_update:
            results.append(e_set)
        else:
            last = results[to_update.pop(-1)]
            for i in to_update:
                last |= results[i]
                del results[i]
            last |= e_set
    return results
# katrielalex's
def pairs(lst):
    i = iter(lst)
    first = prev = item = i.next()
    for item in i:
        yield prev, item
        prev = item
    yield item, first

import networkx

def merge_katrielalex(lsts):
    g = networkx.Graph()
    for lst in lsts:
        for edge in pairs(lst):
            g.add_edge(*edge)
    return networkx.connected_components(g)
# agf's (optimized)
from collections import deque

def merge_agf_optimized(lists):
    sets = deque(set(lst) for lst in lists if lst)
    results = []
    disjoint = 0
    current = sets.pop()
    while True:
        merged = False
        newsets = deque()
        for _ in xrange(disjoint, len(sets)):
            this = sets.pop()
            if not current.isdisjoint(this):
                current.update(this)
                merged = True
                disjoint = 0
            else:
                newsets.append(this)
                disjoint += 1
        if sets:
            newsets.extendleft(sets)
        if not merged:
            results.append(current)
            try:
                current = newsets.pop()
            except IndexError:
                break
            disjoint = 0
        sets = newsets
    return results
# agf's (simple)
def merge_agf_simple(lists):
    newsets, sets = [set(lst) for lst in lists if lst], []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets
# alexis'
def merge_alexis(data):
    bins = range(len(data))  # Initialize each bin[n] == n
    nums = dict()
    data = [set(m) for m in data]  # Convert to sets
    for r, row in enumerate(data):
        for num in row:
            if num not in nums:
                # New number: tag it with a pointer to this row's bin
                nums[num] = r
                continue
            else:
                dest = locatebin(bins, nums[num])
                if dest == r:
                    continue  # already in the same bin
                if dest > r:
                    dest, r = r, dest  # always merge into the smallest bin
                data[dest].update(data[r])
                data[r] = None
                # Update our indices to reflect the move
                bins[r] = dest
                r = dest
    # Filter out the empty bins
    have = [m for m in data if m]
    return have

def locatebin(bins, n):
    while bins[n] != n:
        n = bins[n]
    return n
lsts = []
size = 0
num = 0
max = 0
for line in open("/tmp/test.txt", "r"):
    lst = [int(x) for x in line.split()]
    size += len(lst)
    if len(lst) > max:
        max = len(lst)
    num += 1
    lsts.append(lst)
"""
setup += """
print "%i lists, {class_count} equally distributed classes, average size %i, max size %i" % (num, size/num, max)
""".format(class_count=class_count)
import timeit
print "niklas"
print timeit.timeit("merge_niklas(lsts)", setup=setup, number=3)
print "rik"
print timeit.timeit("merge_rik(lsts)", setup=setup, number=3)
print "katrielalex"
print timeit.timeit("merge_katrielalex(lsts)", setup=setup, number=3)
print "agf (1)"
print timeit.timeit("merge_agf_optimized(lsts)", setup=setup, number=3)
print "agf (2)"
print timeit.timeit("merge_agf_simple(lsts)", setup=setup, number=3)
print "alexis"
print timeit.timeit("merge_alexis(lsts)", setup=setup, number=3)
These timings are obviously dependent on the specific parameters of the benchmark, like the number of classes, the number of lists, list size and so on. Adapt those parameters to your needs to get more helpful results.
Below are some example outputs on my machine for different parameters. They show that all the algorithms have their strengths and weaknesses, depending on the kind of input they get:
=====================
# many disjoint classes, large lists
class_count = 50
class_size = 1000
list_count_per_class = 100
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
=====================
niklas
5000 lists, 50 equally distributed classes, average size 298, max size 999
4.80084705353
rik
5000 lists, 50 equally distributed classes, average size 298, max size 999
9.49251699448
katrielalex
5000 lists, 50 equally distributed classes, average size 298, max size 999
21.5317108631
agf (1)
5000 lists, 50 equally distributed classes, average size 298, max size 999
8.61671280861
agf (2)
5000 lists, 50 equally distributed classes, average size 298, max size 999
5.18117713928
=> alexis
=> 5000 lists, 50 equally distributed classes, average size 298, max size 999
=> 3.73504281044
===================
# less number of classes, large lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.5
===================
niklas
4500 lists, 15 equally distributed classes, average size 296, max size 999
1.79993700981
rik
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.58237695694
katrielalex
4500 lists, 15 equally distributed classes, average size 296, max size 999
19.5465381145
agf (1)
4500 lists, 15 equally distributed classes, average size 296, max size 999
2.75445604324
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 296, max size 999
=> 1.77850699425
alexis
4500 lists, 15 equally distributed classes, average size 296, max size 999
3.23530197144
===================
# less number of classes, smaller lists
class_count = 15
class_size = 1000
list_count_per_class = 300
large_list_sizes = list(range(100, 1000))
small_list_sizes = list(range(0, 100))
large_list_probability = 0.1
===================
niklas
4500 lists, 15 equally distributed classes, average size 95, max size 997
0.773697137833
rik
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.0523750782
katrielalex
4500 lists, 15 equally distributed classes, average size 95, max size 997
6.04466891289
agf (1)
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.20285701752
=> agf (2)
=> 4500 lists, 15 equally distributed classes, average size 95, max size 997
=> 0.714507102966
alexis
4500 lists, 15 equally distributed classes, average size 95, max size 997
1.1286110878
Answer 1 (score: 13)
I have tried to summarize everything that has been said and done about this topic, in this question and in the duplicate one.
I have tried to test and time every solution (all the code is here).
This is the TestCase from my testing module:
import json
import unittest
from copy import deepcopy

class MergeTestCase(unittest.TestCase):

    def setUp(self):
        with open('./lists/test_list.txt') as f:
            self.lsts = json.loads(f.read())
        self.merged = self.merge_func(deepcopy(self.lsts))

    def test_disjoint(self):
        """Check disjoint-ness of merged results"""
        from itertools import combinations
        for a, b in combinations(self.merged, 2):
            self.assertTrue(a.isdisjoint(b))

    def test_coverage(self):    # Credit to katrielalex
        """Check coverage original data"""
        merged_flat = set()
        for s in self.merged:
            merged_flat |= s
        original_flat = set()
        for lst in self.lsts:
            original_flat |= set(lst)
        self.assertTrue(merged_flat == original_flat)

    def test_subset(self):      # Credit to WolframH
        """Check that every original data is a subset"""
        for lst in self.lsts:
            self.assertTrue(any(set(lst) <= e for e in self.merged))
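The snippet above refers to self.merge_func, which the harness in the linked repository injects for each solution. A minimal, hypothetical way to wire a single function into it (the subclass name and the merge_niklas reference are mine, not part of the repository) could be:

class NiklasTestCase(MergeTestCase):
    merge_func = staticmethod(merge_niklas)  # plug in whichever merge function is under test

if __name__ == '__main__':
    unittest.main()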
This test presupposes a list of sets as the result, so I couldn't test a couple of solutions that work with lists.
I could not test the following:
katrielalex
steabert
Among the ones I could test, two failed:
-- Going to test: agf (optimized) --
Check disjoint-ness of merged results ... FAIL
-- Going to test: robert king --
Check disjoint-ness of merged results ... FAIL
These performances are closely related to the test data employed.
So far three answers tried to time their own and the others' solutions. Since they used different testing data, they got different results.
The Niklas benchmark is very handy. With his benchmark it is possible to run different tests, changing some parameters.
I used the same three sets of parameters he used in his own answer, and I put them in three different files:
filename = './lists/timing_1.txt'
class_count = 50,
class_size = 1000,
list_count_per_class = 100,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.5,
filename = './lists/timing_2.txt'
class_count = 15,
class_size = 1000,
list_count_per_class = 300,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.5,
filename = './lists/timing_3.txt'
class_count = 15,
class_size = 1000,
list_count_per_class = 300,
large_list_sizes = (100, 1000),
small_list_sizes = (0, 100),
large_list_probability = 0.1,
These are the results that I got:
From file: timing_1.txt
Timing with: >> Niklas << Benchmark
Info: 5000 lists, average size 305, max size 999
Timing Results:
10.434 -- alexis
11.476 -- agf
11.555 -- Niklas B.
13.622 -- Rik. Poggi
14.016 -- agf (optimized)
14.057 -- ChessMaster
20.208 -- katrielalex
21.697 -- steabert
25.101 -- robert king
76.870 -- Sven Marnach
133.399 -- hochl
From file: timing_2.txt
Timing with: >> Niklas << Benchmark
Info: 4500 lists, average size 305, max size 999
Timing Results:
8.247 -- Niklas B.
8.286 -- agf
8.637 -- Rik. Poggi
8.967 -- alexis
9.090 -- ChessMaster
9.091 -- agf (optimized)
18.186 -- katrielalex
19.543 -- steabert
22.852 -- robert king
70.486 -- Sven Marnach
104.405 -- hochl
From file: timing_3.txt
Timing with: >> Niklas << Benchmark
Info: 4500 lists, average size 98, max size 999
Timing Results:
2.746 -- agf
2.850 -- Niklas B.
2.887 -- Rik. Poggi
2.972 -- alexis
3.077 -- ChessMaster
3.174 -- agf (optimized)
5.811 -- katrielalex
7.208 -- robert king
9.193 -- steabert
23.536 -- Sven Marnach
37.436 -- hochl
With Sven's testing data I got the following results:
Timing with: >> Sven << Benchmark
Info: 200 lists, average size 10, max size 10
Timing Results:
2.053 -- alexis
2.199 -- ChessMaster
2.410 -- agf (optimized)
3.394 -- agf
3.398 -- Rik. Poggi
3.640 -- robert king
3.719 -- steabert
3.776 -- Niklas B.
3.888 -- hochl
4.610 -- Sven Marnach
5.018 -- katrielalex
Finally, with Agf's benchmark I got:
Timing with: >> Agf << Benchmark
Info: 2000 lists, average size 246, max size 500
Timing Results:
3.446 -- Rik. Poggi
3.500 -- ChessMaster
3.520 -- agf (optimized)
3.527 -- Niklas B.
3.527 -- agf
3.902 -- hochl
5.080 -- alexis
15.997 -- steabert
16.422 -- katrielalex
18.317 -- robert king
1257.152 -- Sven Marnach
As I said at the beginning, all the code is available at this git repository. All the merging functions are in a file called core.py; every function there whose name ends with _merge will be auto-loaded during the tests, so it shouldn't be hard to add/test/improve your own solution.
Let me also know if something is wrong; it's been a lot of coding and I could use a couple of fresh eyes :)
Answer 2 (score: 7)
Let me preface this answer with the following comment:
This is the WRONG way to do it. It is prone to numerical instability and is slower than the other methods presented; use at your own risk.
That being said, I couldn't resist attacking the problem from a dynamical point of view (and I hope you'll get a fresh perspective on the problem). In theory this should always work, but eigenvalue calculations can often fail. The idea is to think of your list as a flow from rows to columns. If two rows share a common value there is a connecting flow between them. If we were to think of these flows as water, we would see that the flows cluster into little pools when there is a connecting path between them. For simplicity, I'm going to use a smaller set, though it works with your data set as well:
from numpy import where, newaxis
from scipy import linalg, array, zeros
X = [[0,1,3],[2],[3,1]]
We need to turn the data into a flow graph. If row i flows into value j we put it in the matrix. Here we have 3 rows and 4 unique values:
A = zeros((4,len(X)), dtype=float)
for i,row in enumerate(X):
    for val in row: A[val,i] = 1
In general, you'll need to change the 4 to capture the number of unique values you have. If the set is a list of integers starting from 0 as here, you can simply make it the largest number. We now perform an eigenvalue decomposition; an SVD to be exact, since our matrix is not square.
S = linalg.svd(A)
We want to keep only the 3x3 portion of this answer, since it will represent the flow of the pools. In fact we only want the absolute values of this matrix; we only care whether there is a flow in this cluster space.
M = abs(S[2])
We can treat this matrix M as a Markov matrix and make it explicit by row-normalizing. Once we have this, we compute the (left) eigenvalue decomposition of this matrix.
M /= M.sum(axis=1)[:,newaxis]
U,V = linalg.eig(M,left=True, right=False)
V = abs(V)
Now a disconnected (non-ergodic) Markov matrix has the nice property that, for each non-connected cluster, there is an eigenvalue of unity. The eigenvectors associated with these unity values are the ones we want:
idx = where(U > .999)[0]
C = V.T[idx] > 0
I have to use .999 because of the aforementioned numerical instability. At this point we're done! Each independent cluster can now pull the corresponding rows out:
for cluster in C:
    print where(A[:,cluster].sum(axis=1))[0]
Which gives, as intended:
[0 1 3]
[2]
Change X to your lst and you'll get: [ 0 1 3 4 5 10 11 16] [2 8].
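For readers who want to run the walkthrough end to end, here are the snippets above collected into a single function. The assembly is mine: the function name, the automatic sizing of the matrix, and the use of U.real (linalg.eig returns complex eigenvalues) are my additions; the algorithm itself is unchanged.

from numpy import where, newaxis, zeros
from scipy import linalg

def merge_spectral(lists, threshold=0.999):
    n_vals = max(max(l) for l in lists if l) + 1        # size of the value axis
    A = zeros((n_vals, len(lists)), dtype=float)
    for i, row in enumerate(lists):
        for val in row:
            A[val, i] = 1
    S = linalg.svd(A)
    M = abs(S[2])                       # flow in cluster space
    M /= M.sum(axis=1)[:, newaxis]      # row-normalize into a Markov matrix
    U, V = linalg.eig(M, left=True, right=False)
    V = abs(V)
    idx = where(U.real > threshold)[0]  # unit eigenvalues mark the disconnected clusters
    C = V.T[idx] > 0
    return [list(where(A[:, cluster].sum(axis=1))[0]) for cluster in C]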
Addendum
Why might this be useful? I don't know where your underlying data comes from, but what happens when the connections are not absolute? Say row 1 has entry 3 80% of the time; how would you generalize the problem? The flow method above would work just fine, and would be completely parametrized by that .999 value: the further away from unity it is, the looser the association.
Visual representation
Since a picture is worth 1K words, here are the plots of the matrices A and V for my example and for your lst respectively. Notice how V splits into two clusters (it is a block-diagonal matrix with two blocks after a permutation), since for each example there were only two unique lists!
Faster implementation
In hindsight, I realized that you can skip the SVD step and compute only a single decomposition:
from numpy import dot  # dot is needed in addition to the imports above

M = dot(A.T,A)
M /= M.sum(axis=1)[:,newaxis]
U,V = linalg.eig(M,left=True, right=False)
The advantage of this method (besides speed) is that M is now symmetric, so the computation can be faster and more accurate (no imaginary values to worry about).
Answer 3 (score: 5)
EDIT: OK, the other question has been closed, posting here.
Nice question! It's much simpler if you think of it as a connected-components problem in a graph. The following code uses the excellent networkx graph library and the pairs function from this question.
def pairs(lst):
    i = iter(lst)
    first = prev = item = i.next()
    for item in i:
        yield prev, item
        prev = item
    yield item, first
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
import networkx
g = networkx.Graph()
for sub_list in lists:
    for edge in pairs(sub_list):
        g.add_edge(*edge)
networkx.connected_components(g)
[[1, 2, 3, 5, 6], [8, 9, 10], [11, 12, 13]]
We create a new (empty) graph g. For each sub-list in lists, consider its elements as nodes of the graph and add edges between them. (Since we only care about connectedness, we don't need to add all the edges, only adjacent ones!) Note that add_edge takes two objects, treats them as nodes (adding them if they aren't there already), and adds an edge between them.
Then we just find the connected components of the graph (a solved problem!) and output them as the merged sets.
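One caveat from my side, not part of the original answer: in networkx 2.x, connected_components returns a generator of sets rather than a list of lists, so with a recent version you would materialize the result yourself, for example:

components = [sorted(c) for c in networkx.connected_components(g)]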
Answer 4 (score: 4)
Here's my answer. I haven't checked it against today's batch of answers.
The intersection-based algorithms are O(N^2) since they check each new set against all of the existing ones, so I used an approach that indexes each number and runs in close to O(N) (if we accept that dictionary lookups are O(1)). Then I ran the benchmarks and felt like a complete idiot because it ran slower, but on closer inspection it turned out that the test data ends up with only a handful of distinct result sets, so the quadratic algorithms don't have a lot of work to do. Test it with more than 10-15 distinct bins and my algorithm is much faster. Try test data with more than 50 distinct bins and it is enormously faster.
(Edit: There was also a problem with the way the benchmark is run, but I was wrong in my diagnosis. I altered my code to work with the way the repeated tests are run.)
def mergelists5(data):
    """Check each number in our arrays only once, merging when we find
    a number we have seen before.
    """
    bins = range(len(data))  # Initialize each bin[n] == n
    nums = dict()

    data = [set(m) for m in data]  # Convert to sets

    for r, row in enumerate(data):
        for num in row:
            if num not in nums:
                # New number: tag it with a pointer to this row's bin
                nums[num] = r
                continue
            else:
                dest = locatebin(bins, nums[num])
                if dest == r:
                    continue  # already in the same bin
                if dest > r:
                    dest, r = r, dest  # always merge into the smallest bin
                data[dest].update(data[r])
                data[r] = None
                # Update our indices to reflect the move
                bins[r] = dest
                r = dest

    # Filter out the empty bins
    have = [m for m in data if m]
    print len(have), "groups in result"
    return have

def locatebin(bins, n):
    """
    Find the bin where list n has ended up: Follow bin references until
    we find a bin that has not moved.
    """
    while bins[n] != n:
        n = bins[n]
    return n
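For reference, running it on the lists from Update 1 (an example call added here, not part of the original answer) gives:

>>> groups = mergelists5([[0, 1, 3], [1, 0, 3, 4, 5, 10, 11], [2, 8], [3, 1, 0, 16]])
2 groups in result
>>> [sorted(g) for g in groups]
[[0, 1, 3, 4, 5, 10, 11, 16], [2, 8]]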
Answer 5 (score: 3)
This new function does only the minimum necessary number of disjointness tests, something the other similar solutions fail to do. It also uses a deque to avoid as many linear-time operations as possible, like list slicing and deletion from early in the list.
from collections import deque

def merge(lists):
    sets = deque(set(lst) for lst in lists if lst)
    results = []
    disjoint = 0
    current = sets.pop()
    while True:
        merged = False
        newsets = deque()
        for _ in xrange(disjoint, len(sets)):
            this = sets.pop()
            if not current.isdisjoint(this):
                current.update(this)
                merged = True
                disjoint = 0
            else:
                newsets.append(this)
                disjoint += 1
        if sets:
            newsets.extendleft(sets)
        if not merged:
            results.append(current)
            try:
                current = newsets.pop()
            except IndexError:
                break
            disjoint = 0
        sets = newsets
    return results
The less overlap there is between the sets in a given data set, the better this will do compared to the other functions.
Here is an example case. If you have 4 sets, you need to compare:
1 with 2, 1 with 3, 1 with 4, 2 with 3, 2 with 4, 3 with 4.
If 1 overlaps with 3, then 2 needs to be re-tested to see whether it now overlaps with 1, in order to safely skip testing 2 against 3.
There are two ways to deal with this. The first is to restart the testing of set 1 against the other sets after every overlap and merge. The second is to continue the testing by comparing 1 with 4, then going back and re-testing. The latter results in fewer disjointness tests, because more merges happen in a single pass, so on the re-test pass there are fewer sets left to test against.
The problem is keeping track of which sets have to be re-tested. In the above example, 1 needs to be re-tested against 2 but not against 4, because 1 was already in its current state before 4 was tested the first time.
The disjoint counter allows this to be tracked.
My answer doesn't help with the main problem of finding an improved algorithm for recoding into FORTRAN; it is just what appears to me to be the simplest and most elegant way to implement the algorithm in Python.
According to my testing (or the test in the accepted answer), it's slightly (up to 10%) faster than the next fastest solution.
def merge0(lists):
    newsets, sets = [set(lst) for lst in lists if lst], []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets
No un-Pythonic counters (i, range) or complicated mutation (del, pop, insert) are needed. It uses only simple iteration, merges overlapping sets in the simplest manner, and builds a single new list on each pass through the data.
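As a quick sanity check on the question's own data (an example call added here, not part of the original answer):

>>> [sorted(s) for s in merge0([[0, 1, 3], [1, 0, 3, 4, 5, 10, 11], [2, 8], [3, 1, 0, 16]])]
[[0, 1, 3, 4, 5, 10, 11, 16], [2, 8]]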
My (faster and simpler) version of the testing code:
import random
tenk = range(10000)
lsts = [random.sample(tenk, random.randint(0, 500)) for _ in range(2000)]
setup = """
def merge0(lists):
    newsets, sets = [set(lst) for lst in lists if lst], []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets

def merge1(lsts):
    sets = [set(lst) for lst in lsts if lst]
    merged = 1
    while merged:
        merged = 0
        results = []
        while sets:
            common, rest = sets[0], sets[1:]
            sets = []
            for x in rest:
                if x.isdisjoint(common):
                    sets.append(x)
                else:
                    merged = 1
                    common |= x
            results.append(common)
        sets = results
    return sets
lsts = """ + repr(lsts)
import timeit
print timeit.timeit("merge0(lsts)", setup=setup, number=10)
print timeit.timeit("merge1(lsts)", setup=setup, number=10)
Answer 6 (score: 1)
This would be my updated approach:
def merge(data):
    sets = (set(e) for e in data if e)
    results = [next(sets)]
    for e_set in sets:
        to_update = []
        for i,res in enumerate(results):
            if not e_set.isdisjoint(res):
                to_update.insert(0,i)
        if not to_update:
            results.append(e_set)
        else:
            last = results[to_update.pop(-1)]
            for i in to_update:
                last |= results[i]
                del results[i]
            last |= e_set
    return results
Note: during the merging, empty lists will be removed.
Update: Reliability.
You need two tests for a 100% reliability of success:
Check that all the result sets are mutually disjoint:
merged = [{0, 1, 3, 4, 5, 10, 11, 16}, {8, 2}, {8}]
from itertools import combinations
for a,b in combinations(merged,2):
    if not a.isdisjoint(b):
        raise Exception(a,b)  # just an example
Check that the merged set covers the original data. (As suggested by katrielalex.)
I think this will take some time, but maybe it'll be worth it if you want to be 100% sure.
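The second check is not spelled out above; a simple version of it (my sketch) flattens both sides and compares:

from itertools import chain

def covers_original(lsts, result):
    # every element of the input must appear in the merged result, and nothing extra
    return set(chain(*result)) == set(chain(*lsts))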
Answer 7 (score: 1)
Here's an implementation using a disjoint-set data structure (specifically a disjoint forest), thanks to comingstorm's hint about merging sets which have even one element in common. I'm using path compression for a slight (~5%) speed improvement; it's not entirely necessary (and it prevents find from being tail recursive, which could slow things down). Note that I'm using a dict to represent the disjoint forest; given that the data are ints, an array would also work, although it might not be much faster.
def merge(data):
    parents = {}
    def find(i):
        j = parents.get(i, i)
        if j == i:
            return i
        k = find(j)
        if k != j:
            parents[i] = k
        return k
    for l in filter(None, data):
        parents.update(dict.fromkeys(map(find, l), find(l[0])))
    merged = {}
    for k, v in parents.items():
        merged.setdefault(find(v), []).append(k)
    return merged.values()
This approach is comparable to the other best algorithms in Rik's benchmarks.
Answer 8 (score: 1)
Firstly, I'm not exactly sure the benchmarks are fair:
Adding the following code to the start of my function:
c = Counter(chain(*lists))
print c[1]
"88"
This means that of all the values in all the lists, there are only 88 distinct values. Usually in the real world duplicates are rare, and you would expect a lot more distinct values. (Of course I don't know where your data comes from, so I can't make assumptions.)
Because duplicates are more common, it means sets are less likely to be disjoint. This means the set.isdisjoint() method will be much faster, because only after a few tests will it find that the sets aren't disjoint.
Having said all that, I do believe the presented methods that use disjoint are the fastest anyway; I'm just saying that, instead of being 20x faster, perhaps they should only be 10x faster than the other methods with different benchmark testing.
Anyway, I thought I would try a slightly different technique to solve this. However, the merge sorting turned out to be too slow, and this method is about 20 times slower than the two fastest methods in the benchmarking:
I thought I would order everything.
import heapq
from itertools import chain

def merge6(lists):
    for l in lists:
        l.sort()
    one_list = heapq.merge(*[zip(l,[i]*len(l)) for i,l in enumerate(lists)]) #iterating through one_list takes 25 seconds!!
    previous = one_list.next()

    d = {i:i for i in range(len(lists))}
    for current in one_list:
        if current[0]==previous[0]:
            d[current[1]] = d[previous[1]]
        previous=current

    groups=[[] for i in range(len(lists))]
    for k in d:
        groups[d[k]].append(lists[k]) #add a each list to its group

    return [set(chain(*g)) for g in groups if g] #since each subroup in each g is sorted, it would be faster to merge these subgroups removing duplicates along the way.

lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
print merge6(lists)
"[set([1, 2, 3, 5, 6]), set([8, 9, 10]), set([11, 12, 13])]""
import timeit
print timeit.timeit("merge1(lsts)", setup=setup, number=10)
print timeit.timeit("merge4(lsts)", setup=setup, number=10)
print timeit.timeit("merge6(lsts)", setup=setup, number=10)
5000 lists, 5 classes, average size 74, max size 1000
1.26732238315
5000 lists, 5 classes, average size 74, max size 1000
1.16062907437
5000 lists, 5 classes, average size 74, max size 1000
30.7257182826
Answer 9 (score: 1)
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
import networkx as nx
g = nx.Graph()
for sub_list in lists:
    for i in range(1,len(sub_list)):
        g.add_edge(sub_list[0],sub_list[i])
print nx.connected_components(g)
#[[1, 2, 3, 5, 6], [8, 9, 10], [11, 12, 13]]
Performance:
5000 lists, 5 classes, average size 74, max size 1000
15.2264976415
Performance of merge1:
print timeit.timeit("merge1(lsts)", setup=setup, number=10)
5000 lists, 5 classes, average size 74, max size 1000
1.26998780571
So it's about 11x slower than the fastest... but the code is much simpler and more readable!
Answer 10 (score: 1)
Just for fun...
def merge(mylists):
    results, sets = [], [set(lst) for lst in mylists if lst]
    upd, isd, pop = set.update, set.isdisjoint, sets.pop
    while sets:
        if not [upd(sets[0],pop(i)) for i in xrange(len(sets)-1,0,-1) if not isd(sets[0],sets[i])]:
            results.append(pop(0))
    return results
And here is the best answer rewritten:
def merge(lsts):
    sets = map(set,lsts)
    results = []
    while sets:
        first, rest = sets[0], sets[1:]
        merged = False
        sets = []
        for s in rest:
            if s and s.isdisjoint(first):
                sets.append(s)
            else:
                first |= s
                merged = True
        if merged: sets.append(first)
        else: results.append(first)
    return results
Answer 11 (score: 1)
Here is a function (Python 3.1) to check whether the result of a merge function is OK. It checks that the result sets do not overlap, that their union matches the items of the input lists, that every input list is contained in one of the result sets, and that no result set could be split into two or more parts:
from itertools import chain

def check(lsts, result):
    lsts = [set(s) for s in lsts]
    all_items = set(chain(*lsts))
    all_result_items = set(chain(*result))
    num_result_items = sum(len(s) for s in result)
    if num_result_items != len(all_result_items):
        print("Error: result sets overlap!")
        print(num_result_items, len(all_result_items))
        print(sorted(map(len, result)), sorted(map(len, lsts)))
    if all_items != all_result_items:
        print("Error: result doesn't match input lists!")
    if not all(any(set(s).issubset(t) for t in result) for s in lsts):
        print("Error: not all input lists are contained in a result set!")

    seen = set()
    todo = list(filter(bool, lsts))
    done = False
    while not done:
        deletes = []
        for i, s in enumerate(todo):  # intersection with seen, or with unseen result set, is OK
            if not s.isdisjoint(seen) or any(t.isdisjoint(seen) for t in result if not s.isdisjoint(t)):
                seen.update(s)
                deletes.append(i)
        for i in reversed(deletes):
            del todo[i]
        done = not deletes
    if todo:
        print("Error: A result set should be split into two or more parts!")
        print(todo)
Answer 12 (score: 1)
This is slower than the solution offered by Niklas (I got 3.9 s on test.txt instead of 0.5 s for his solution), but it yields the same result and might be easier to implement in, e.g., Fortran, since it doesn't use sets, only a sort of the total amount of elements followed by a single run over all of them.
It returns a list with the ids of the merged lists, so it also keeps track of empty lists; they stay unmerged.
def merge(lsts):
    # this is an index list that stores the joined id for each list
    joined = range(len(lsts))
    # create an ordered list with indices
    indexed_list = sorted((el,index) for index,lst in enumerate(lsts) for el in lst)
    # loop throught the ordered list, and if two elements are the same and
    # the lists are not yet joined, alter the list with joined id
    el_0,idx_0 = None,None
    for el,idx in indexed_list:
        if el == el_0 and joined[idx] != joined[idx_0]:
            old = joined[idx]
            rep = joined[idx_0]
            joined = [rep if id == old else id for id in joined]
        el_0, idx_0 = el, idx
    return joined
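For example, on the lists from Update 1 (an example call added here, not part of the original answer), the returned ids mark lists 0, 1 and 3 as belonging to one group and list 2 to its own:

>>> merge([[0, 1, 3], [1, 0, 3, 4, 5, 10, 11], [2, 8], [3, 1, 0, 16]])
[0, 0, 2, 0]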
Answer 13 (score: 0)
Use a flag to ensure you get the final, mutually exclusive results.
def merge(lists):
    while True:
        flag = 0
        for i in range(0, len(lists)):
            for j in range(i+1, len(lists)):
                # treat the lists as sets for the overlap test and the merge
                if len(set(lists[i]) & set(lists[j])) != 0:
                    lists[i] = list(set(lists[i]) | set(lists[j]))
                    lists.remove(lists[j])
                    flag += 1
                    break
            if flag:
                break  # restart the scan with fresh indices after a merge
        if flag == 0:
            break
    return lists
Answer 14 (score: 0)
from itertools import combinations

def merge(elements_list):
    d = {index: set(elements) for index, elements in enumerate(elements_list)}
    while any(not set.isdisjoint(d[i], d[j]) for i, j in combinations(d.keys(), 2)):
        merged = set()
        for i, j in combinations(d.keys(), 2):
            if not set.isdisjoint(d[i], d[j]):
                d[i] = set.union(d[i], d[j])
                merged.add(j)
        for k in merged:
            d.pop(k)
    return [v for v in d.values() if v]
lst = [[65, 17, 5, 30, 79, 56, 48, 62],
[6, 97, 32, 93, 55, 14, 70, 32],
[75, 37, 83, 34, 9, 19, 14, 64],
[43, 71],
[],
[89, 49, 1, 30, 28, 3, 63],
[35, 21, 68, 94, 57, 94, 9, 3],
[16],
[29, 9, 97, 43],
[17, 63, 24]]
print(merge(lst))
Answer 15 (score: -1)
My solution works well on small lists and is quite readable without dependencies.
def merge_list(starting_list):
    final_list = []
    for i,v in enumerate(starting_list[:-1]):
        if set(v)&set(starting_list[i+1]):
            starting_list[i+1].extend(list(set(v) - set(starting_list[i+1])))
        else:
            final_list.append(v)
    final_list.append(starting_list[-1])
    return final_list
Benchmarking it:
lists = [[1,2,3],[3,5,6],[8,9,10],[11,12,13]]
%timeit merge_list(lists)
100000 loops, best of 3: 4.9 µs per loop
Answer 16 (score: -1)
This can be solved in O(n) by using the union-find algorithm. Given the first two rows of your data, the edges to use in the union-find are the following pairs: (0,1),(1,3),(1,0),(0,3),(3,4),(4,5),(5,10)
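The answer stops there; a minimal union-find sketch of that idea (my own illustration, with variable names and structure that are not the original poster's) could look like this:

def merge_union_find(lists):
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]   # path halving keeps the trees flat
            x = parent[x]
        return x

    def union(a, b):
        parent[find(a)] = find(b)

    for lst in lists:
        for a, b in zip(lst, lst[1:]):      # adjacent pairs are enough, as in the edge list above
            union(a, b)

    groups = {}
    for lst in lists:
        for x in lst:
            groups.setdefault(find(x), set()).add(x)
    return list(groups.values())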