在Python的标准库中发现difflib.SequenceMatcher
类不适合我的需求之后,编写了一个通用的“diff”模块来解决这一问题。经过几个月的进一步思考,我发现该递归算法做了多余的工作:它会重新搜索序列中相同的区域,而这些区域可能已经被另一个独立的“搜索线程”检查过了。
diff
模块的目的是计算一对序列(列表,元组,字符串,字节,字节数组等)之间的差异和相似性。初始版本比代码的当前形式慢得多,速度提高了十倍。如何将memoization应用于以下代码?重写算法以进一步提高速度的最佳方法是什么?
class Slice:

    """Three-way partition of a sequence: text before, the matched root, text after."""

    __slots__ = 'prefix', 'root', 'suffix'

    def __init__(self, prefix, root, suffix):
        """Store the partitioned sections of one sequence."""
        self.prefix, self.root, self.suffix = prefix, root, suffix
################################################################################
class Match:

    """One equal slice shared by two sequences, plus its sub-search results."""

    __slots__ = 'a', 'b', 'prefix', 'suffix', 'value'

    def __init__(self, a, b, prefix, suffix, value):
        """Record both Slice views, the prefix/suffix trees, and the score."""
        self.a, self.b = a, b
        self.prefix, self.suffix = prefix, suffix
        self.value = value
################################################################################
class Tree:

    """All matches found at one level, their scores, and the best score."""

    __slots__ = 'nodes', 'index', 'value'

    def __init__(self, nodes, index, value):
        """Keep parallel lists of Match nodes and scores with their maximum."""
        self.nodes, self.index, self.value = nodes, index, value
################################################################################
def search(a, b):
    """Compute differences and similarities between sequences "a" and "b".

    Returns a Tree whose nodes are Match records for every pair of equal
    slices found and whose value is the best total match size.  This entry
    point seeds the memo table that stops the recursion from re-solving
    identical (prefix, prefix) and (suffix, suffix) sub-problems — the
    exponential blowup of the original uncached version.
    """
    return _search_memoized(a, b, {})


def _search_memoized(a, b, memo):
    """Recursive core of search(); sub-results are cached in "memo"."""
    # Initialize startup variables.
    nodes, index = [], []
    a_size, b_size = len(a), len(b)
    # Begin to slice the sequences, widest slices first.
    for size in range(min(a_size, b_size), 0, -1):
        for a_addr in range(a_size - size + 1):
            # Slice "a" at address and end.
            a_term = a_addr + size
            a_root = a[a_addr:a_term]
            for b_addr in range(b_size - size + 1):
                # Slice "b" at address and end.
                b_term = b_addr + size
                b_root = b[b_addr:b_term]
                # Find out if slices are equal.
                if a_root == b_root:
                    # Search the prefix pair through the cache.
                    a_pref, b_pref = a[:a_addr], b[:b_addr]
                    p_tree = _search_cached(a_pref, b_pref, memo)
                    # Search the suffix pair through the cache.
                    a_suff, b_suff = a[a_term:], b[b_term:]
                    s_tree = _search_cached(a_suff, b_suff, memo)
                    # Make completed slice objects.
                    a_slic = Slice(a_pref, a_root, a_suff)
                    b_slic = Slice(b_pref, b_root, b_suff)
                    # Finish the match calculation.
                    value = size + p_tree.value + s_tree.value
                    match = Match(a_slic, b_slic, p_tree, s_tree, value)
                    # Append results to tree lists.
                    nodes.append(match)
                    index.append(value)
    # Return largest matches found.
    if nodes:
        return Tree(nodes, index, max(index))
    # Give caller null tree object.
    return Tree(nodes, index, 0)


def _search_cached(a, b, memo):
    """Return the memoized Tree for (a, b), computing it on first use.

    Unhashable sequences (e.g. lists) cannot be dict keys, so they fall
    back to the original uncached recursion instead of raising, which
    preserves the module's documented support for list inputs.
    """
    try:
        return memo[a, b]
    except KeyError:
        # First time this sub-problem is seen: solve it and remember it.
        result = memo[a, b] = _search_memoized(a, b, memo)
        return result
    except TypeError:
        # Key is unhashable — solve without caching (same results, slower).
        return _search_memoized(a, b, memo)
参考: How to optimize a recursive algorithm to not repeat itself?
答案 0 :(得分:1)
您可以使用Python Decorator Library中的memoize装饰器 并像这样使用它:
@memoized
def search(a, b):
第一次使用参数search
调用a,b
时,会计算并记忆结果(保存在缓存中)。第二次使用相同的参数调用search
,结果从缓存返回。
请注意,要使 memoized 装饰器起作用,参数必须是可哈希的(hashable)。如果 a 和 b 是数字元组,那么它们就是可哈希的。如果它们是列表,那么可以在把它们传递给 search 之前先将其转换为元组。
从代码来看,search 似乎不会接收字典(dict)作为参数;但如果接收了,字典是不可哈希的,memoization 装饰器将无法把结果保存到缓存中。
答案 1 :(得分:1)
正如 ~unutbu 所说,试试 memoized 装饰器,并进行以下更改:
@memoized
def search(a, b):
    """Diff sequences "a" and "b", relying on @memoized to cache results.

    NOTE(review): @memoized is not defined in this snippet; presumably it
    is the decorator from the Python Decorator Library referenced in the
    previous answer — confirm before running.
    NOTE(review): the list() conversions below turn every slice into a
    list, but lists are NOT hashable, so the recursive calls receive
    unhashable arguments and the memoized cache cannot store them.
    tuple() was presumably intended; verify before use.
    """
    # Initialize startup variables.
    nodes, index = [], []
    a_size, b_size = len(a), len(b)
    # Begin to slice the sequences.
    for size in range(min(a_size, b_size), 0, -1):
        for a_addr in range(a_size - size + 1):
            # Slice "a" at address and end.
            a_term = a_addr + size
            a_root = list(a)[a_addr:a_term] #change to list
            for b_addr in range(b_size - size + 1):
                # Slice "b" at address and end.
                b_term = b_addr + size
                b_root = list(b)[b_addr:b_term] #change to list
                # Find out if slices are equal.
                if a_root == b_root:
                    # Create prefix tree to search.
                    a_pref, b_pref = list(a)[:a_addr], list(b)[:b_addr]
                    p_tree = search(a_pref, b_pref)
                    # Create suffix tree to search.
                    a_suff, b_suff = list(a)[a_term:], list(b)[b_term:]
                    s_tree = search(a_suff, b_suff)
                    # Make completed slice objects.
                    a_slic = Slice(a_pref, a_root, a_suff)
                    b_slic = Slice(b_pref, b_root, b_suff)
                    # Finish the match calculation.
                    value = size + p_tree.value + s_tree.value
                    match = Match(a_slic, b_slic, p_tree, s_tree, value)
                    # Append results to tree lists.
                    nodes.append(match)
                    index.append(value)
    # Return largest matches found.
    if nodes:
        return Tree(nodes, index, max(index))
    # Give caller null tree object.
    return Tree(nodes, index, 0)
对于 memoization 来说,字典是最合适的,但字典不能被切片,因此必须将参数转换为列表,如上面代码中的注释所示。
答案 2 :(得分:0)
问这个问题已有9年了,但是如今,内部缓存结果以加速算法的概念终于被应用到了代码中。该应用程序的结果如下所示:
#! /usr/bin/env python3
"""Compute differences and similarities between a pair of sequences.
After finding the "difflib.SequenceMatcher" class unsuitable, this module
was written and re-written several times into the polished version below."""
__author__ = 'Stephen "Zero" Chappell <Noctis.Skytower@gmail.com>'
__date__ = '3 September 2019'
__version__ = '$Revision: 4 $'
class Slice:

    """A sequence split into the part before a match, the match, and the part after."""

    __slots__ = 'prefix', 'root', 'suffix'

    def __init__(self, prefix, root, suffix):
        """Remember all three sections of the split sequence."""
        self.prefix, self.root, self.suffix = prefix, root, suffix
class Match:

    """A common root found in both sequences, with child trees and a score."""

    __slots__ = 'a', 'b', 'prefix', 'suffix', 'value'

    def __init__(self, a, b, prefix, suffix, value):
        """Bind the two Slice views, both sub-trees, and the total value."""
        self.a, self.b = a, b
        self.prefix, self.suffix = prefix, suffix
        self.value = value
class Tree:

    """Search results: Match nodes, their parallel scores, and the best score."""

    __slots__ = 'nodes', 'index', 'value'

    def __init__(self, nodes, index, value):
        """Store the node list, the score list, and the maximum score."""
        self.nodes, self.index, self.value = nodes, index, value
def search(a, b):
    """Public entry point: diff "a" against "b" using a fresh memo table."""
    cache = {}
    return _search(a, b, cache)
def _search(a, b, memo):
    """Recursively diff "a" against "b", caching every sub-result in "memo"."""

    def lookup(x, y):
        # Resolve a sub-search through the shared cache, computing on a miss.
        pair = (x, y)
        try:
            return memo[pair]
        except KeyError:
            tree = memo[pair] = _search(x, y, memo)
            return tree

    matches, scores = [], []
    len_a, len_b = len(a), len(b)
    # Try every slice width, widest first, at every pair of offsets.
    for width in range(min(len_a, len_b), 0, -1):
        for start_a in range(len_a - width + 1):
            # Cut the candidate piece out of "a".
            stop_a = start_a + width
            piece_a = a[start_a:stop_a]
            for start_b in range(len_b - width + 1):
                # Cut the candidate piece out of "b".
                stop_b = start_b + width
                piece_b = b[start_b:stop_b]
                # Only equal pieces produce a match record.
                if piece_a == piece_b:
                    # Diff what comes before and after each piece via the cache.
                    head_a, head_b = a[:start_a], b[:start_b]
                    tail_a, tail_b = a[stop_a:], b[stop_b:]
                    before = lookup(head_a, head_b)
                    after = lookup(tail_a, tail_b)
                    # Score is this width plus both neighbours' best scores.
                    total = width + before.value + after.value
                    matches.append(Match(
                        Slice(head_a, piece_a, tail_a),
                        Slice(head_b, piece_b, tail_b),
                        before, after, total))
                    scores.append(total)
    # An empty result set yields the null tree with a zero score.
    best = max(scores) if scores else 0
    return Tree(matches, scores, best)