我编写了一个搜索算法,在排序列表中找到一个字符串,然后在任一侧搜索条目以查找重复项。
import re
found = []
missing = []
def find_media(media, drive_inv):
"""
media is a string.
drive_inv is a list of strings.
use binary search to find a match,
followed by a linear seach either side
to check for duplicates.
append a match to the global list, found.
else append to the global list, missing.
"""
def linear_search_up(media, line):
""" line is an int, to index drive_inv with. """
try:
if re.search(media, drive_inv[line+1], re.IGNORECASE):
found.append(drive_inv[line+1])
return linear_search_up(media, line+1)
else:
return
except IndexError:
return
def linear_search_down(media, line):
""" line is an int, to index drive_inv with. """
try:
if re.search(media, drive_inv[line-1], re.IGNORECASE):
found.append(drive_inv[line-1])
return linear_search_down(media, line-1)
else:
return
except IndexError:
return
def binary_search(media, low, high):
"""
low and high are ints - the boundries of the
binary search algorithm.
if a match is found, execute the linear seach
function on the entries either side.
"""
if high == low:
if re.search(media, drive_inv[low], re.IGNORECASE):
found.append(drive_inv[low])
return
else:
missing.append(media)
return
mid = (low + high) / 2
if re.search(media, drive_inv[mid], re.IGNORECASE):
found.append(drive_inv[mid])
# now check the entries either side
return (
linear_search_up(media, mid),
linear_search_down(media, mid
)
# if the filename > media, discard the larger entries
elif drive_inv[mid].split('/')[-1] > media:
if low == mid:
missing.append(media)
return
else:
return binary_search(media, low, mid-1)
# if the filename < media, discard the smaller entries
else:
return binary_search(media, mid+1, high)
if len(drive_inv) == 0:
return
else:
return binary_search(media, 0, len(drive_inv)-1)
它似乎运作良好,但它有点难看,将结果附加到全局列表。我希望它能够返回所有比赛的元组。但是,如果我改变:
found.append(drive_inv[line+1])
return linear_search_up(media, line+1)
为:
return (
drive_inv[line+1],
linear_search_up(media, line+1)
)
我最终得到一个看起来像的元组:
(('A001C002', ('A001C002', None)), ('A001C002', ('A001C002', ('A001C002', ('A001C002', None)))))
......这不太好。
这可以重写并仍然使用递归吗?或者我应该考虑不同的方法吗?
答案 0 :(得分:0)
虽然可能修改代码以使其按照您的意愿执行,但使用下面显示的方法可以实现同样的目标 - 这可能更快,并且不需要对字符串列表进行排序。 / p>
from collections import Counter
def find_media(media, drive_inv):
cnt = Counter(drive_inv).get(media, 0)
return (media,)*cnt if cnt else None
drive_inv = ['A001C000', 'A001C000', 'A001C001', 'A001C002', 'A001C002',
'A001C002', 'A001C003', 'A001C003', 'A001C003', 'A001C004',
'A001C005']
print find_media('A001C002', drive_inv) # -> ('A001C002', 'A001C002', 'A001C002')
print find_media('A001C099', drive_inv) # -> None
如果您希望在找不到media
而不是None
时返回空元组,请将函数的return语句更改为:
return (media,)*cnt