我需要一个函数将一个iterable拆分成块,并选择在块之间重叠。
我写了下面的代码,它给了我正确的输出,但这是非常低效(慢)。我无法弄清楚如何加快速度。有更好的方法吗?
def split_overlap(seq, size, overlap):
'''(seq,int,int) => [[...],[...],...]
Split a sequence into chunks of a specific size and overlap.
Works also on strings!
Examples:
>>> split_overlap(seq=list(range(10)),size=3,overlap=2)
[[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8], [7, 8, 9]]
>>> split_overlap(seq=range(10),size=3,overlap=2)
[range(0, 3), range(1, 4), range(2, 5), range(3, 6), range(4, 7), range(5, 8), range(6, 9), range(7, 10)]
>>> split_overlap(seq=list(range(10)),size=7,overlap=2)
[[0, 1, 2, 3, 4, 5, 6], [5, 6, 7, 8, 9]]
'''
if size < 1 or overlap < 0:
raise ValueError('"size" must be an integer with >= 1 while "overlap" must be >= 0')
result = []
while True:
if len(seq) <= size:
result.append(seq)
return result
else:
result.append(seq[:size])
seq = seq[size-overlap:]
到目前为止测试结果:
l = list(range(10))
s = 4
o = 2
print(split_overlap(l,s,o))
print(list(split_overlap_jdehesa(l,s,o)))
print(list(nwise_overlap(l,s,o)))
print(list(split_overlap_Moinuddin(l,s,o)))
print(list(gen_split_overlap(l,s,o)))
print(list(itr_split_overlap(l,s,o)))
[[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]]
[(0, 1, 2, 3), (2, 3, 4, 5), (4, 5, 6, 7), (6, 7, 8, 9)]
[(0, 1, 2, 3), (2, 3, 4, 5), (4, 5, 6, 7), (6, 7, 8, 9), (8, 9, None, None)] #wrong
[[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9], [8, 9]] #wrong
[[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]]
[(0, 1, 2, 3), (2, 3, 4, 5), (4, 5, 6, 7), (6, 7, 8, 9)]
%%timeit
split_overlap(l,7,2)
718 ns ± 2.36 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%%timeit
list(split_overlap_jdehesa(l,7,2))
4.02 µs ± 64.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%%timeit
list(nwise_overlap(l,7,2))
5.05 µs ± 102 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%%timeit
list(split_overlap_Moinuddin(l,7,2))
3.89 µs ± 78.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%%timeit
list(gen_split_overlap(l,7,2))
1.22 µs ± 13.3 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%%timeit
list(itr_split_overlap(l,7,2))
3.41 µs ± 36.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
将较长的列表作为输入:
l = list(range(100000))
%%timeit
split_overlap(l,7,2)
4.27 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%timeit
list(split_overlap_jdehesa(l,7,2))
31.1 ms ± 495 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
list(nwise_overlap(l,7,2))
5.74 ms ± 66 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
list(split_overlap_Moinuddin(l,7,2))
16.9 ms ± 89.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
list(gen_split_overlap(l,7,2))
4.54 ms ± 22.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
list(itr_split_overlap(l,7,2))
19.1 ms ± 240 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
从其他测试(此处未报告),结果发现对于小型列表len(list) <= 100
,我的原始实现split_overlap()
是最快的。但是对于任何大于此的事情,到目前为止,gen_split_overlap()
是最有效的解决方案。
答案 0 :(得分:3)
有时可读性与速度有关。一个简单的生成器迭代索引,生成切片可以在合理的时间内完成工作:
def gen_split_overlap(seq, size, overlap):
if size < 1 or overlap < 0:
raise ValueError('size must be >= 1 and overlap >= 0')
for i in range(0, len(seq) - overlap, size - overlap):
yield seq[i:i + size]
如果你想处理潜在的无限可迭代,你只需要保持重叠项与前一个yield和slice size - 重叠新项:
def itr_split_overlap(iterable, size, overlap):
itr = iter(iterable)
# initial slice, in case size exhausts iterable on the spot
next_ = tuple(islice(itr, size))
yield next_
# overlap for initial iteration
prev = next_[-overlap:] if overlap else ()
# For long lists the repeated calls to a lambda are slow, but using
# the 2-argument form of `iter()` is in general a nice trick.
#for chunk in iter(lambda: tuple(islice(itr, size - overlap)), ()):
while True:
chunk = tuple(islice(itr, size - overlap))
if not chunk:
break
next_ = (*prev, *chunk)
yield next_
# overlap == 0 is a special case
if overlap:
prev = next_[-overlap:]
答案 1 :(得分:2)
您可以使用zip
和列表理解创建自定义函数,以实现此目的:
def split_overlap(seq, size, overlap):
return [x for x in zip(*[seq[i::size-overlap] for i in range(size)])]
示例运行
# Chunk size: 3
# Overlap: 2
>>> split_overlap(list(range(10)), 3, 2)
[(0, 1, 2), (1, 2, 3), (2, 3, 4), (3, 4, 5), (4, 5, 6), (5, 6, 7), (6, 7, 8), (7, 8, 9)]
# Chunk size: 3
# Overlap: 1
>>> split_overlap(list(range(10)), 3, 1)
[(0, 1, 2), (2, 3, 4), (4, 5, 6), (6, 7, 8)]
# Chunk size: 4
# Overlap: 1
>>> split_overlap(list(range(10)), 4, 1)
[(0, 1, 2, 3), (3, 4, 5, 6), (6, 7, 8, 9)]
# Chunk size: 4
# Overlap: 2
>>> split_overlap(list(range(10)), 4, 2)
[(0, 1, 2, 3), (2, 3, 4, 5), (4, 5, 6, 7), (6, 7, 8, 9)]
# Chunk size: 4
# Overlap: 1
>>> split_overlap(list(range(10)), 4, 3)
[(0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5), (3, 4, 5, 6), (4, 5, 6, 7), (5, 6, 7, 8), (6, 7, 8, 9)]
如果你想显示块,即使它们不满足块大小的先决条件,那么你应该使用Python 3.x中的itertools.zip_longest
(相当于{ Python 2.x中的{3}}。
此外,这是动态屈服动态的变体,如果您有大量列表,这在内存方面更有效:
# Python 3.x
from itertools import zip_longest as iterzip
# Python 2.x
from itertools import izip_longest as iterzip
# Generator function
def split_overlap(seq, size, overlap):
for x in iterzip(*[my_list[i::size-overlap] for i in range(size)]):
yield tuple(i for i in x if i!=None) if x[-1]==None else x
# assuming that your initial list is ^
# not containing the `None`, use of `iterzip` is based
# on the same assumption
示例运行:
# v type-cast to list in order to display the result,
# v not required during iterations
>>> list(split_overlap(list(range(10)),7,2))
[[0, 1, 2, 3, 4, 5, 6], [5, 6, 7, 8, 9]]
答案 2 :(得分:1)
你的方法与它将获得的一样好,你需要轮询序列/ iterable并构建块,但无论如何,这是一个与iterables一起使用并使用deque
的惰性版本性能:
from collections import deque
def split_overlap(iterable, size, overlap=0):
size = int(size)
overlap = int(overlap)
if size < 1 or overlap < 0 or overlap >= size:
raise ValueError()
pops = size - overlap
q = deque(maxlen=size)
for elem in iterable:
q.append(elem)
if len(q) == size:
yield tuple(q)
for _ in range(pops):
q.popleft()
# Yield final incomplete tuple if necessary
if len(q) > overlap:
yield tuple(q)
>>> list(split_overlap(range(10), 4, 2))
[(0, 1, 2, 3), (3, 4, 5, 6), (6, 7, 8, 9)]
>>> list(split_overlap(range(10), 5, 2))
[(0, 1, 2, 3, 4), (3, 4, 5, 6, 7), (6, 7, 8, 9)]
注意:如果输入没有产生确切数量的块,则生成器会产生最后一个不完整的元组(参见第二个示例)。如果您想避免这种情况,请删除最终的if len(q) > overlap: yield tuple(q)
。
答案 3 :(得分:0)
您可以尝试使用
itertools.izip(...)
这对大型列表很有用,因为它返回迭代器而不是列表。
像这样:import itertools
def split_overlap(iterable, size, overlap):
'''(iter,int,int) => [[...],[...],...]
Split an iterable into chunks of a specific size and overlap.
Works also on strings!
Examples:
>>> split_overlap(iterable=list(range(10)),size=3,overlap=2)
[[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8], [7, 8, 9]]
>>> split_overlap(iterable=range(10),size=3,overlap=2)
[range(0, 3), range(1, 4), range(2, 5), range(3, 6), range(4, 7), range(5, 8), range(6, 9), range(7, 10)]
'''
if size < 1 or overlap < 0:
raise ValueError('"size" must be an integer with >= 1 while "overlap" must be >= 0')
result = []
for i in itertools.izip(*[iterable[i::size-overlap] for i in range(size)]):
result.append(i)
return result