以下代码为给定列表生成长度为k
(k子集分区)的所有分区。
该算法可以在this主题中找到。
def algorithm_u(ns, m):
def visit(n, a):
ps = [[] for i in xrange(m)]
for j in xrange(n):
ps[a[j + 1]].append(ns[j])
return ps
def f(mu, nu, sigma, n, a):
if mu == 2:
yield visit(n, a)
else:
for v in f(mu - 1, nu - 1, (mu + sigma) % 2, n, a):
yield v
if nu == mu + 1:
a[mu] = mu - 1
yield visit(n, a)
while a[nu] > 0:
a[nu] = a[nu] - 1
yield visit(n, a)
elif nu > mu + 1:
if (mu + sigma) % 2 == 1:
a[nu - 1] = mu - 1
else:
a[mu] = mu - 1
if (a[nu] + sigma) % 2 == 1:
for v in b(mu, nu - 1, 0, n, a):
yield v
else:
for v in f(mu, nu - 1, 0, n, a):
yield v
while a[nu] > 0:
a[nu] = a[nu] - 1
if (a[nu] + sigma) % 2 == 1:
for v in b(mu, nu - 1, 0, n, a):
yield v
else:
for v in f(mu, nu - 1, 0, n, a):
yield v
def b(mu, nu, sigma, n, a):
if nu == mu + 1:
while a[nu] < mu - 1:
yield visit(n, a)
a[nu] = a[nu] + 1
yield visit(n, a)
a[mu] = 0
elif nu > mu + 1:
if (a[nu] + sigma) % 2 == 1:
for v in f(mu, nu - 1, 0, n, a):
yield v
else:
for v in b(mu, nu - 1, 0, n, a):
yield v
while a[nu] < mu - 1:
a[nu] = a[nu] + 1
if (a[nu] + sigma) % 2 == 1:
for v in f(mu, nu - 1, 0, n, a):
yield v
else:
for v in b(mu, nu - 1, 0, n, a):
yield v
if (mu + sigma) % 2 == 1:
a[nu - 1] = 0
else:
a[mu] = 0
if mu == 2:
yield visit(n, a)
else:
for v in b(mu - 1, nu - 1, (mu + sigma) % 2, n, a):
yield v
n = len(ns)
a = [0] * (n + 1)
for j in xrange(1, m + 1):
a[n - m + j] = j - 1
return f(m, n, 0, n, a)
我们知道给定列表的k子集数等于Stirling number
,对于某些大型列表来说可能非常大。
上面的代码返回一个Python生成器,它可以通过调用下一个方法为给定列表生成所有可能的k子集分区。因此,如果我想随机获得其中一个分区,我必须在一些随机时间调用next方法(如果Stirling数字很大,这会使它非常慢)或使用itertools.islice
方法得到一个像以前一样真的很慢的一片大小。
我试图避免列出所有分区,因为这会浪费时间和速度甚至内存(因为计算很多,内存在我的情况下很重要)。
问题是我怎样才能生成k个子集分区中的一个而不生成其余的?或者至少使程序比上面解释的更快。我需要性能,因为我每次只需要其中一个,并且我运行的应用程序可能超过一千万次。
我很感激任何帮助。
编辑:示例
列表:{ 1, 2, 3 }
表示k = 3:
{ {1}, {2}, {3} }
表示k = 2:
{ {1, 2}, {3} }
{ {1, 3}, {2} }
{ {1}, {2, 3} }
,对于k = 1:
{ {1, 2, 3} }
考虑k = 2,有什么办法可以随机生成这3个分区中的一个,而不生成其他2个分区?请注意,我想为任何给定的k生成随机分区,不仅是任何k的随机分区,这意味着如果我将k设置为2,我只想生成这3个中的一个而不是所有5中的一个。
此致
穆罕默德
答案 0 :(得分:12)
通过存储先前计算的值,您可以使用递归算法有效地计算斯特林数:
fact=[1]
def nCr(n,k):
"""Return number of ways of choosing k elements from n"""
while len(fact)<=n:
fact.append(fact[-1]*len(fact))
return fact[n]/(fact[k]*fact[n-k])
cache = {}
def count_part(n,k):
"""Return number of ways of partitioning n items into k non-empty subsets"""
if k==1:
return 1
key = n,k
if key in cache:
return cache[key]
# The first element goes into the next partition
# We can have up to y additional elements from the n-1 remaining
# There will be n-1-y left over to partition into k-1 non-empty subsets
# so n-1-y>=k-1
# y<=n-k
t = 0
for y in range(0,n-k+1):
t += count_part(n-1-y,k-1) * nCr(n-1,y)
cache[key] = t
return t
一旦你知道有多少选择,你可以调整这个递归代码来生成一个特定的分区:
def ith_subset(A,k,i):
"""Return ith k-subset of A"""
# Choose first element x
n = len(A)
if n==k:
return A
if k==0:
return []
for x in range(n):
# Find how many cases are possible with the first element being x
# There will be n-x-1 left over, from which we choose k-1
extra = nCr(n-x-1,k-1)
if i<extra:
break
i -= extra
return [A[x]] + ith_subset(A[x+1:],k-1,i)
def gen_part(A,k,i):
"""Return i^th k-partition of elements in A (zero-indexed) as list of lists"""
if k==1:
return [A]
n=len(A)
# First find appropriate value for y - the extra amount in this subset
for y in range(0,n-k+1):
extra = count_part(n-1-y,k-1) * nCr(n-1,y)
if i<extra:
break
i -= extra
# We count through the subsets, and for each subset we count through the partitions
# Split i into a count for subsets and a count for the remaining partitions
count_partition,count_subset = divmod(i,nCr(n-1,y))
# Now find the i^th appropriate subset
subset = [A[0]] + ith_subset(A[1:],y,count_subset)
S=set(subset)
return [subset] + gen_part([a for a in A if a not in S],k-1,count_partition)
作为一个例子,我编写了一个测试程序,它产生4个数字的不同分区:
def test(A):
n=len(A)
for k in [1,2,3,4]:
t = count_part(n,k)
print k,t
for i in range(t):
print " ",i,gen_part(A,k,i)
test([1,2,3,4])
此代码打印:
1 1
0 [[1, 2, 3, 4]]
2 7
0 [[1], [2, 3, 4]]
1 [[1, 2], [3, 4]]
2 [[1, 3], [2, 4]]
3 [[1, 4], [2, 3]]
4 [[1, 2, 3], [4]]
5 [[1, 2, 4], [3]]
6 [[1, 3, 4], [2]]
3 6
0 [[1], [2], [3, 4]]
1 [[1], [2, 3], [4]]
2 [[1], [2, 4], [3]]
3 [[1, 2], [3], [4]]
4 [[1, 3], [2], [4]]
5 [[1, 4], [2], [3]]
4 1
0 [[1], [2], [3], [4]]
作为另一个例子,有1千万个分区1,2,3,... 14分为4个部分。 此代码可以使用pypy在44秒内生成所有分区。
有50,369,882,873,307,917,364,901分区1,2,3,...,40分为4个部分。这个代码可以在120秒内生成1000万个这样的代码,而pypy在单个处理器上运行。
要将事物联系在一起,您可以使用此代码生成列表A的单个随机分区到k个非空子集:
import random
def random_ksubset(A,k):
i = random.randrange(0,count_part(len(A),k))
return gen_part(A,k,i)
答案 1 :(得分:5)
这样的事情怎么样:
import itertools
import random
def random_ksubset(ls, k):
# we need to know the length of ls, so convert it into a list
ls = list(ls)
# sanity check
if k < 1 or k > len(ls):
return []
# Create a list of length ls, where each element is the index of
# the subset that the corresponding member of ls will be assigned
# to.
#
# We require that this list contains k different values, so we
# start by adding each possible different value.
indices = list(range(k))
# now we add random values from range(k) to indices to fill it up
# to the length of ls
indices.extend([random.choice(list(range(k))) for _ in range(len(ls) - k)])
# shuffle the indices into a random order
random.shuffle(indices)
# construct and return the random subset: sort the elements by
# which subset they will be assigned to, and group them into sets
return [{x[1] for x in xs} for (_, xs) in
itertools.groupby(sorted(zip(indices, ls)), lambda x: x[0])]
这会生成随机k子集分区,如下所示:
>>> ls = {1,2,3}
>>> print(random_ksubset(ls, 2))
[set([1, 2]), set([3])]
>>> print(random_ksubset(ls, 2))
[set([1, 3]), set([2])]
>>> print(random_ksubset(ls, 2))
[set([1]), set([2, 3])]
>>> print(random_ksubset(ls, 2))
[set([1]), set([2, 3])]
这种方法满足了OP获取一个随机生成的分区的要求,而不需要枚举所有可能的分区。这里的内存复杂性是线性的由于排序,运行时复杂度为O(N log N)。我想如果这很重要,可能会使用更复杂的构造返回值的方法将其降低到线性。
正如@Leon指出的那样,这满足了他的选项2在尝试定义问题时的要求。这不能做的是确定性地生成分区#N(这是Leon的选项1,这将允许您随机选择一个整数N然后检索相应的分区)。莱昂的澄清很重要,因为为了满足问题的精神,应该以相同的概率生成集合的每个可能的划分。关于我们的玩具问题,情况就是这样:
>>> from collections import Counter
>>> Counter(frozenset(map(frozenset, random_ksubset(ls, 2))) for _ in range(10000))
Counter({frozenset({frozenset({2, 3}), frozenset({1})}): 3392,
frozenset({frozenset({1, 3}), frozenset({2})}): 3212,
frozenset({frozenset({1, 2}), frozenset({3})}): 3396})
然而。通常,此方法不会以相同的概率生成每个分区。考虑:
>>> Counter(frozenset(map(frozenset, random_ksubset(range(4), 2)))
... for _ in range(10000)).most_common()
[(frozenset({frozenset({1, 3}), frozenset({0, 2})}), 1671),
(frozenset({frozenset({1, 2}), frozenset({0, 3})}), 1667),
(frozenset({frozenset({2, 3}), frozenset({0, 1})}), 1642),
(frozenset({frozenset({0, 2, 3}), frozenset({1})}), 1285),
(frozenset({frozenset({2}), frozenset({0, 1, 3})}), 1254),
(frozenset({frozenset({0, 1, 2}), frozenset({3})}), 1245),
(frozenset({frozenset({1, 2, 3}), frozenset({0})}), 1236)]
我们在这里可以看到,我们更有可能产生更平衡的&#34;分区(因为有更多的方法来构建这些)。包含单例集的分区生成频率较低。
似乎是集合is sort of an unsolved research question的k分区上的有效均匀抽样方法(也见mathoverflow)。 Nijenhuis和Wilf为所有分区提供了采样代码(第12章),这可以与拒绝测试一起使用,@ PeterdeRivaz的answer也可以统一采样k分区。这两种方法的缺点是它们需要计算斯特林数,它在n中呈指数增长,算法是递归的,我认为这会使它们在大输入上变慢。正如你提到的那样,数百万&#34;你评论中的分区,我认为这些方法只能处理一定的输入大小。
一个。 Nijenhuis和H. Wilf。计算机与计算机的组合算法 计算器。学术出版社,Orlando FL,第二版,1978年。
探索莱昂的选项1可能很有趣。这是一个粗略的过程,使用@Amadan建议将整数值解释为k-ary数来确定性地生成集合的特定分区。请注意,并非每个整数值都会生成有效的k子集分区(因为我们不允许空子集):
def amadan(ls, N, k):
"""
Given a collection `ls` with length `b`, a value `k`, and a
"partition number" `N` with 0 <= `N` < `k**b`, produce the Nth
k-subset paritition of `ls`.
"""
ls = list(ls)
b = len(ls)
if not 0 <= N < k**b: return None
# produce the k-ary index vector from the number N
index = []
# iterate through each of the subsets
for _ in range(b):
index.append(N % k)
N //= k
# subsets cannot be empty
if len(set(index)) != k: return None
return frozenset(frozenset(x[1] for x in xs) for (_, xs) in
itertools.groupby(sorted(zip(index, ls)),
lambda x:x[0]))
我们可以确认这会正确生成Stirling numbers:
>>> for i in [(4,1), (4,2), (4,3), (4,4), (5,1), (5,2), (5,3), (5,4), (5,5)]:
... b,k = i
... r = [amadan(range(b), N, k) for N in range(k**b)]
... r = [x for x in r if x is not None]
... print(i, len(set(r)))
(4, 1) 1
(4, 2) 7
(4, 3) 6
(4, 4) 1
(5, 1) 1
(5, 2) 15
(5, 3) 25
(5, 4) 10
(5, 5) 1
这也可以以相同的概率产生每个可能的分区;我不太确定。这是一个测试用例,它起作用:
>>> b,k = 4,3
>>> r = [amadan(range(b), N, k) for N in range(k**b)]
>>> r = [x for x in r if x is not None]
>>> print(Counter([' '.join(sorted(''.join(map(str, x)) for x in p)) for p in r]))
Counter({'0 13 2': 6,
'01 2 3': 6,
'0 12 3': 6,
'03 1 2': 6,
'02 1 3': 6,
'0 1 23': 6})
另一个工作案例:
>>> b,k = 5,4
>>> r = [amadan(range(b), N, k) for N in range(k**b)]
>>> r = [x for x in r if x is not None]
>>> print(Counter([' '.join(sorted(''.join(map(str, x)) for x in p)) for p in r]))
Counter({'0 12 3 4': 24,
'04 1 2 3': 24,
'0 1 23 4': 24,
'01 2 3 4': 24,
'03 1 2 4': 24,
'0 13 2 4': 24,
'0 1 24 3': 24,
'02 1 3 4': 24,
'0 1 2 34': 24,
'0 14 2 3': 24})
所以,把它包装成一个函数:
def random_ksubset(ls, k):
ls = list(ls)
maxn = k**len(ls)-1
rv = None
while rv is None:
rv = amadan(ls, random.randint(0, maxn), k)
return rv
然后我们可以做到:
>>> random_ksubset(range(3), 2)
frozenset({frozenset({2}), frozenset({0, 1})})
>>> random_ksubset(range(3), 2)
frozenset({frozenset({1, 2}), frozenset({0})})
>>> random_ksubset(range(3), 2)
frozenset({frozenset({1, 2}), frozenset({0})})
>>> random_ksubset(range(3), 2)
frozenset({frozenset({2}), frozenset({0, 1})})