我有一些表示事件顺序的字符串:
# Sample event sequences; events are separated by '->'.
s1 = 'A->B->E->D->A->C->B->D'
s2 = 'A->B->C->A->B'
s3 = 'A->B->A'
在每个字符串中,我想找到最大长度为N的所有重复模式。
import itertools
def find_all_comb(event_list, max_events):
    """Return every distinct ordered combination of up to ``max_events`` events.

    Combinations are drawn with ``itertools.combinations``, so events keep the
    relative order they have in ``event_list``. Each combination is rendered
    as a ``'->'``-joined string.

    Args:
        event_list: Sequence of event names (strings).
        max_events: Largest combination size to generate (inclusive).

    Returns:
        A list of pattern strings, grouped by increasing combination size.
        Duplicates within a size are dropped and first-seen order is kept,
        so the result is identical on every run.
    """
    all_combs = []
    for size in range(1, max_events + 1):
        joined = (
            '->'.join(combo)
            for combo in itertools.combinations(event_list, size)
        )
        # dict.fromkeys de-duplicates like set() but keeps insertion order;
        # the iteration order of a set of strings can differ between runs.
        all_combs.extend(dict.fromkeys(joined))
    return all_combs
def find_repeating_patterns(x):
    """Return the repeated event patterns of ``x`` that dominate all others.

    ``x`` is a ``'->'``-separated event string. A candidate pattern counts as
    repeated when, after dropping every event that is not a substring of the
    pattern, the pattern text occurs more than once in what is left.

    The repeated patterns are then filtered again and again, each pass
    keeping only patterns that are strictly longer than, and contain every
    character of, some other pattern in the current list. The last non-empty
    list is returned.

    Because a pattern must be strictly longer than another repeated pattern
    to survive the first pass, an input with a single repeated pattern
    (for example ``'A->B->A'``) yields an empty list.
    """
    events = x.split("->")
    # The size cap comes from the character length of x, not the event count.
    candidates = find_all_comb(events, len(x) // 2)

    repeated = []
    for pattern in candidates:
        # `e in pattern` is a substring test against the joined pattern text.
        projected = '->'.join(e for e in events if e in pattern)
        if projected.count(pattern) > 1:
            repeated.append(pattern)

    def keep_dominating(patterns):
        # set(...) here is the set of characters of each pattern string.
        return [
            s for s in patterns
            if any(len(s) > len(i) and set(s).issuperset(set(i)) for i in patterns)
        ]

    result = []
    current = keep_dominating(repeated)
    # Stop at a fixed point, or as soon as a pass filters everything out.
    while current and result != current:
        result = list(current)
        current = keep_dominating(current)
    return result
对于 s1,它返回正确的模式 [A,B,D];对于 s2,它返回 [A,B];对于 s3,它应该返回 [A],但实际返回的是一个空列表。这是因为下面这一行:
[s for s in repeating_patterns if any(set(s).issuperset(set(i)) and len(s) > len(i) for i in repeating_patterns)]
不允许len(s) > len(i)
。
我如何在这里捕获这两种情况?
答案 0 :(得分:1)
这是一个更简单,更有效的解决方案:
def longest_subsequence(events, limit, sep='->'):
    """Find the longest event subsequences that occur at least twice.

    Two occurrences only count as a repeat when they share no event
    position, so a pattern is never matched against an overlapping copy of
    itself. Lengths are tried from longest to shortest and the search stops
    at the first length that produces a match.

    Args:
        events: Event string whose events are separated by ``sep``.
        limit: Maximum number of events in a reported pattern.
        sep: Separator between events; also used to join the results.

    Returns:
        A list of ``sep``-joined patterns, all of the same (greatest
        matching) length, in the order they were first confirmed as
        repeats. Empty when nothing repeats.
    """
    indexed = list(enumerate(events.split(sep)))
    # Two index-disjoint occurrences of an n-event pattern need 2 * n events,
    # so longer candidates can never match and are not enumerated.
    longest = min(limit, len(indexed) // 2)
    output = {}
    seen = {}  # pattern -> index set of its first occurrence
    for n in range(longest, 0, -1):
        for combination in itertools.combinations(indexed, n):
            indexes, key = zip(*combination)
            if key not in seen:
                seen[key] = set(indexes)
            elif key not in output and seen[key].isdisjoint(indexes):
                output[key] = sep.join(key)
        if output:
            # Shorter patterns are not wanted once a longer one has matched.
            break
    return list(output.values())
该函数首先查找最长的匹配项,一旦找到匹配就提前终止。它保存每个模式首次出现时的索引,并与当前候选的索引进行比较,从而排除与自身重叠的重复子序列。
演示:
# Demo inputs: the three strings from the question plus extra edge cases
# (overlapping repeats, runs of one event, a single event, an empty string).
samples = (
    'A->B->E->D->A->C->B->D',
    'A->B->C->A->B',
    'A->B->A',
    'A->B->E->D->A->C->B->E->D->A',
    'B->B->B->C->C',
    'A->B->A->B->C->C',
    'A',
    '',
)

# Print each sample (numbered from 1) followed by its patterns of up to
# four events.
for position, text in enumerate(samples, start=1):
    patterns = longest_subsequence(text, 4)
    print('(%s) %r\n%s\n' % (position, text, patterns))
输出:
(1) 'A->B->E->D->A->C->B->D'
['A->B->D']
(2) 'A->B->C->A->B'
['A->B']
(3) 'A->B->A'
['A']
(4) 'A->B->E->D->A->C->B->E->D->A'
['A->B->E->D', 'B->E->D->A']
(5) 'B->B->B->C->C'
['B->C']
(6) 'A->B->A->B->C->C'
['A->B->C']
(7) 'A'
[]
(8) ''
[]
答案 1 :(得分:0)
一种解决方案是增加一行代码:把所有不是 output_list 中任何序列的子序列的重复模式,也添加到 output_list 中。
import itertools
def find_all_comb(event_list, max_events):
    """Return every distinct ordered combination of up to ``max_events`` events.

    Combinations are drawn with ``itertools.combinations``, so events keep the
    relative order they have in ``event_list``. Each combination is rendered
    as a ``'->'``-joined string.

    Args:
        event_list: Sequence of event names (strings).
        max_events: Largest combination size to generate (inclusive).

    Returns:
        A list of pattern strings, grouped by increasing combination size.
        Duplicates within a size are dropped and first-seen order is kept,
        so the result is identical on every run.
    """
    all_combs = []
    for size in range(1, max_events + 1):
        joined = (
            '->'.join(combo)
            for combo in itertools.combinations(event_list, size)
        )
        # dict.fromkeys de-duplicates like set() but keeps insertion order;
        # the iteration order of a set of strings can differ between runs.
        all_combs.extend(dict.fromkeys(joined))
    return all_combs
def _keep_strict_supersets(patterns):
    """Keep patterns that are longer than, and cover the characters of, another."""
    # set(...) is the set of characters of each pattern string.
    return [
        s for s in patterns
        if any(set(s).issuperset(set(i)) and len(s) > len(i) for i in patterns)
    ]


def find_repeating_patterns(x):
    """Return the longest repeated event patterns found in ``x``.

    ``x`` is a ``'->'``-separated event string. A candidate pattern counts as
    repeated when, after dropping every event that is not a substring of the
    pattern, the pattern text occurs more than once in what is left.

    The repeated patterns are filtered repeatedly down to those that strictly
    extend another pattern. Any repeated pattern whose characters are not
    covered by a surviving pattern is then added back, so an input with a
    single repeated pattern (for example ``'A->B->A'``) returns that pattern
    instead of an empty list.

    Args:
        x: Event string such as ``'A->B->C->A->B'``.

    Returns:
        A list of ``'->'``-joined pattern strings.
    """
    split_events = x.split("->")
    # The size cap comes from the character length of x, not the event count.
    all_combs = find_all_comb(split_events, int(len(x) / 2))

    repeating_patterns = []
    for comb in all_combs:
        # `p in comb` is a substring test against the joined pattern text.
        c_split_event = [p for p in split_events if p in comb]
        if '->'.join(c_split_event).count(comb) > 1:
            repeating_patterns.append(comb)

    output_list = []
    longest_repeating_patterns = _keep_strict_supersets(repeating_patterns)
    # Stop at a fixed point, or as soon as a pass filters everything out.
    while output_list != longest_repeating_patterns:
        if longest_repeating_patterns == []:
            break
        output_list = longest_repeating_patterns.copy()
        longest_repeating_patterns = _keep_strict_supersets(longest_repeating_patterns)

    # Added line for the solution: also keep every repeated pattern that is
    # not covered by any pattern already selected.
    output_list.extend([
        s for s in repeating_patterns
        if not any(set(i).issuperset(set(s)) for i in output_list)
    ])
    return output_list
# Usage example: the three strings from the question and the result for each.
s1 = 'A->B->E->D->A->C->B->D'
s2 = 'A->B->C->A->B'
s3 = 'A->B->A'

print(find_repeating_patterns(s1))
# output: ['A->B->D']
print(find_repeating_patterns(s2))
# output: ['A->B']
print(find_repeating_patterns(s3))
# output: ['A']