作为Large list generation optimization的后续行动,我想以相反的方式解决问题。
给出以下形式的字符串列表:
seq = ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
我想要一个能够返回"合并"元素的列表的函数,并保留顺序,如:
merged = ['A.p[1:11]', 'B.p[0:3]', 'A.p[0]']
为了实现我的目标,我写了这个函数:
def merge_seq(seq):
    """Merge adjacent entries of a sequence of indexed-name strings.

    Each string is of the form ``word[index]`` or ``word[bottom:top]``.
    Consecutive entries with the same ``word`` whose index ranges overlap
    or touch are collapsed into a single ``word[bottom:top]`` entry;
    original ordering is preserved otherwise.

    ex: given ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
    return ['A.p[1:11]','B.p[0:3]','A.p[0]']
    """
    merged_list = []
    for item in seq:
        # First item? Add to list.
        if not merged_list:
            merged_list.append(item)
            continue
        previous = merged_list[-1]
        # Skip exact duplicates of the previous entry.
        if item == previous:
            continue
        cur_name, _, cur_idx = item.partition('[')
        prev_name, _, prev_idx = previous.partition('[')
        # Different word -> cannot merge, start a new entry.
        if cur_name != prev_name:
            merged_list.append(item)
            continue
        # Parse "[a]" -> [a] and "[a:b]" -> [a, b] (trailing ']' stripped).
        cur_range = [int(x) for x in cur_idx[:-1].split(':')]
        prev_range = [int(x) for x in prev_idx[:-1].split(':')]
        bottom = max(cur_range[0], prev_range[0])
        top = min(cur_range[-1], prev_range[-1])
        # Ranges overlap (bottom <= top) or are adjacent (bottom == top + 1):
        # rewrite the last entry to span both.  (The original code tested
        # this with `abs(bottom-top) == 1 or xrange(bottom, top+1)`, which
        # is the same predicate expressed via range truthiness.)
        if bottom <= top + 1:
            lo = min(cur_range[0], prev_range[0])
            hi = max(cur_range[-1], prev_range[-1])
            merged_list[-1] = '%s[%s]' % (prev_name, '%s:%s' % (lo, hi))
        else:
            # No overlap: keep as a separate entry.
            merged_list.append(item)
    return merged_list
此函数的问题在于处理大型序列时速度非常慢。
更新
在做了一些研究后,我偶然发现了这个用于检测列表中连续整数的答案(this answer to detect consecutive integers in a list)。这激发了我利用itertools和operator编写一个新函数:
from itertools import groupby
from operator import itemgetter
import re
def merge_seq2(seq):
    """Merge a sequence of indexed-name strings using set arithmetic.

    Each string is of the form ``word[index]`` or ``word[bottom:top]``.
    Indices for each run of same-named entries are accumulated into a set,
    then emitted as one entry per run of consecutive integers.

    ex: given ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
    becomes ['A.p[1:11]','B.p[0:3]','A.p[0]']
    """
    pattern = re.compile(r"([0-9a-zA-Z._]+)\[(\d+)(?::(\d+))?\]")

    def _flush(name, values, out):
        # Emit one entry per run of consecutive integers in `values`.
        # Key i - x is constant within a consecutive run, so groupby
        # splits the sorted values at each gap.
        for _, grp in groupby(enumerate(sorted(values)), lambda pair: pair[0] - pair[1]):
            run = [v for _, v in grp]
            if len(run) == 1:
                out.append('%s[%s]' % (name, run[0]))
            else:
                out.append('%s[%s:%s]' % (name, run[0], run[-1]))

    current = ''
    values = set()
    result = []
    for item in seq:
        m = pattern.match(item)
        name, start, end = m.group(1), int(m.group(2)), m.group(3)
        # "[a:b]" expands to range(a, b+1); "[a]" is the single index a.
        rng = range(start, int(end) + 1) if end else (start,)
        if name != current:
            # New name: flush accumulated values for the previous name.
            if values:
                _flush(current, values, result)
            current = name
            values = set(rng)
        else:
            values.update(rng)
    # Flush the final run and return.
    if values:
        _flush(current, values, result)
    return result
现在让我们比较两个函数并使用this solution to generate large lists:
def flatten_seq(seq):
    """Expand each ``word[a:b]`` / ``word[i]`` entry into individual ``word[i]`` strings.

    Answered by JuniorCompressor
    https://stackoverflow.com/questions/29089435/large-list-generation-optimization/29089675#29089675
    Faster than the one-line nested comprehension that re-splits each item.
    """
    pattern = re.compile(r"([0-9a-zA-Z._]+)\[(\d+)(?::(\d+))?\]")
    result = []
    for item in seq:
        m = pattern.match(item)
        name, start, end = m.group(1), int(m.group(2)), m.group(3)
        # "[a:b]" expands to range(a, b+1); "[a]" is the single index a.
        rng = range(start, int(end) + 1) if end else (start,)
        prefix = name + "["
        result.extend(prefix + str(i) + "]" for i in rng)
    return result
# Benchmark: expand 1 million entries, then time both merge implementations.
# NOTE(review): `time` must be imported earlier in the file — confirm.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.12.
seq = ['A[1:500000]', 'B[1:500000]']
t1 = time.perf_counter()
flat = flatten_seq(seq)
t2 = time.perf_counter()
print('# Flattened in %s secs' % round(t2 - t1, 3))
merged = merge_seq(flat)
t3 = time.perf_counter()
print('# Old Merge %s secs' % round(t3 - t2, 3))
merged = merge_seq2(flat)
t4 = time.perf_counter()
print('# New Merge %s secs' % round(t4 - t3, 3))
# Sample output:
# Flattened in 0.265 secs
# Old Merge 6.76 secs
# New Merge 2.613 secs
那快了〜2.5倍!
merge_seq2的唯一小问题是,在某些情况下,当给定未展开(unflattened)的列表时,它可能比原始的merge_seq函数慢。
如果有人建议加快速度,我很乐意听到它们!
答案 0 :(得分:0)
我决定利用我对itertools新发现的知识来解决下面的功能。它比merge_seq2快几分之一秒,并且比我原来的merge_seq函数更快。当处理大的未平坦范围(例如:0:5000000)时,由于在xrange集合中丢弃了大量数据,因此这种方法比原始方法慢一点。但在我的日常使用中,到目前为止,这作为一般解决方案更好。
def merge_seq3(seq):
    """Merge a sequence of indexed-name strings, grouping runs by name.

    Each string is of the form ``word[index]`` or ``word[bottom:top]``.
    Consecutive same-named entries are grouped with groupby, their indices
    pooled into a set, then emitted as one entry per run of consecutive
    integers.

    ex: given ['A.p[11]','A.p[1:10]','B.p[1:2]','B.p[2]','B.p[3]','B.p[0]','A.p[0]']
    becomes ['A.p[1:11]','B.p[0:3]','A.p[0]']
    """
    result = []
    # Group consecutive entries sharing the same name (text before '[').
    for key, items in groupby(seq, lambda s: s.split('[')[0]):
        values = set()
        for item in items:
            # Strip "name[" prefix and trailing "]" to get "a" or "a:b".
            bounds = item[len(key) + 1:-1].split(':')
            if len(bounds) == 1:
                values.add(int(bounds[0]))
            else:
                values.update(range(int(bounds[0]), int(bounds[1]) + 1))
        # Emit one entry per run of consecutive integers: key i - x is
        # constant within a run, so groupby splits at each gap.
        for _, grp in groupby(enumerate(sorted(values)), lambda pair: pair[0] - pair[1]):
            run = [v for _, v in grp]
            if len(run) == 1:
                result.append('%s[%s]' % (key, run[0]))
            else:
                result.append('%s[%s:%s]' % (key, run[0], run[-1]))
    return result