概述
我有一组可用的有效块,可以用来分割文本(如果可能的话)。
如何使用这些块拆分给定文本,例如结果将根据生成的块数进行优化(最小化)?
TEST SUITE
if __name__ == "__main__":
import random
import sys
random.seed(1)
# 1) Testing robustness
examples = []
sys.stdout.write("Testing correctness...")
N = 50
large_number = "3141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086513282306647093844609550582231725359408128481"
for i in range(100):
for j in range(i):
choices = random.sample(range(i), j)
examples.append((choices, large_number))
for (choices, large_number) in examples:
get_it_done(choices, large_number)
sys.stdout.write("OK")
# 2) Testing correctness
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
# Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
# Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
# Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
# Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]
score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag
print(
"Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))
# 3) TODO: Testing optimization, it should provide (if possible)
# minimal cases
问题
如何在不使用强力方法的情况下在python上解决这个问题?
答案 0 :(得分:7)
使用动态编程,您可以构建列表(l0, l1, l2, ... ln-1)
,其中n
是输入字符串中的字符数,li
是您需要到达的最小块数输入字符串的字符i
。整体结构如下:
minValues := list with n infinity entries
for i from 0 to n-1
for every choice c that is a suffix of input[0..i]
if i - len(c) < 0
newVal = 1
else
newVal = minValues[i - len(c)] + 1
end if
if(newVal < minValues[i])
minValues[i] = newVal
//optionally record the used chunk
end if
next
next
然后,整个字符串的最小块数为ln-1
。您可以通过跟踪列表(需要记录使用过的块)来获取实际的块。
使用trie(反向选择字符串)可以加速检索作为后缀的选项。最差情况的复杂性仍为O(n * c * lc)
,其中n
是输入字符串的长度,c
是选择的数量,lc
是最大长度选择。但是,这种复杂性仅适用于嵌套后缀的选项(例如0
,10
,010
,0010
...)。在这种情况下,trie将退化为列表。平均而言,运行时间应该少得多。假设从trie中检索到的选择的数量总是一个很小的常数,它是O(n * lc)
(实际上,lc
因子可能也更小。)
以下是一个例子:
choices = ["0","1","10","100"]
text = "10010"
algorithm step content of minValues
0 1 2 3 4
---------------------------------------------------------
initialize (∞, ∞ , ∞ , ∞ , ∞ )
i = 0, c = "1" (1 "1", ∞ , ∞ , ∞ , ∞ )
i = 1, c = "0" (1 "1", 2 "0", ∞ , ∞ , ∞ )
i = 1, c = "10" (1 "1", 1 "10", ∞ , ∞ , ∞ )
i = 2, c = "0" (1 "1", 1 "10", 2 "0", ∞ , ∞ )
i = 2, c = "100" (1 "1", 1 "10", 1 "100", ∞ , ∞ )
i = 3, c = "1" (1 "1", 1 "10", 1 "100", 2 "1", ∞ )
i = 4, c = "0" (1 "1", 1 "10", 1 "100", 2 "1", 3 "0" )
i = 4, c = "10" (1 "1", 1 "10", 1 "100", 2 "1", 2 "10")
含义:我们可以用2个块组成字符串。追溯以相反的顺序给出块:“10”,“100”。
答案 1 :(得分:4)
对不起,实施有点hacky。但我认为它总能找到最佳答案。 (虽然没有证明。)这是python中一个快速而完整的实现,并为所有提议的用例返回正确的答案。
该算法是递归的,其工作原理如下:
算法完成后,所有可能的路径(以及不可能的路径,即末尾不匹配)应该只被遍历一次。
为了有效地执行第2步,我为选择构建了一个patricia树,以便可以快速查找匹配文本开头的可能块。
def get_seq_in_tree(tree, choice):
if type(tree)!=dict:
if choice == tree:
return [choice]
return []
for i in range(1, len(choice)+1):
if choice[:i] in tree:
return [choice[:i]] + get_seq_in_tree(tree[choice[:i]], choice[i:])
return []
def seq_can_end_here(tree, seq):
res = []
last = tree
for e, c in enumerate(seq):
if '' in last[c]:
res.append(e+1)
last = last[c]
return res
def build_tree(choices):
tree = {}
choices = sorted(choices)
for choice in choices:
last = tree
for c in choice:
if c not in last:
last[c] = {}
last = last[c]
last['']=None
return tree
solution_cache = {}
ncalls = 0
def solve(tree, number):
global solution_cache
global ncalls
ncalls +=1
# take every path only once
if number in solution_cache:
return solution_cache[number]
solutions = []
seq = get_seq_in_tree(tree, number)
endings = seq_can_end_here(tree, seq)
for i in reversed(endings):
current_solution = []
current_solution.append(number[:i])
if i == len(number):
solutions.append(current_solution)
else:
next_solution = solve(tree, number[i:])
if next_solution:
solutions.append(current_solution + next_solution)
if not solutions:
return None
shortest_solution = sorted([(len(solution), solution) for solution in solutions])[0][1]
solution_cache[number] = shortest_solution
return shortest_solution
def get_it_done(choices, number):
tree = build_tree(choices)
solution = solve(tree, number)
return solution
if __name__ == "__main__":
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
## Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
## Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
### Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
## Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]
score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag
print("Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))
我猜复杂性类似于O(L * N * log(C)),其中L是文本的长度,N是词汇量的大小,C是选择的数量。
编辑:包含缺少的测试用例。
答案 2 :(得分:2)
def find_shortest_path(graph, start, end, path=[]):
path = path + [start]
if start == end:
return path
if start not in graph:
return None
shortest = None
for node in graph[start]:
if node not in path:
newpath = find_shortest_path(graph, node, end, path)
if newpath:
if not shortest or len(newpath) < len(shortest):
shortest = newpath
return shortest
def get_it_done(choices, number):
mapping = {}
graph = {}
for choice in choices:
if choice in number:
_from = number.index(choice)
_to = _from + len(choice)
mapping.setdefault((_from, _to), choice)
items = sorted(mapping.items(), key=lambda x: x[0])
for _range, value in items:
_from, _to = _range
graph.setdefault(_from, []).append(_to)
start = 0
end = _range[1] #this is hack, works only in python 2.7
path = find_shortest_path(graph, start, end)
ranges = [tuple(path[i:i+2]) for i in range(len(path) - 1)]
if len(ranges) == 1:
items = sorted(choices, key=len, reverse=True)
number_length = len(number)
result = ''
for item in items:
result += item
if len(result) == number_length:
return result
return [mapping[_range] for _range in ranges]
if __name__ == "__main__":
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
## Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
## Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
### Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
## Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
),
# Example6
## Solution ['100', '10']
(
["0", "1", "10", "100"],
"10010"
)
]
score = 0
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
flag = "".join(res) == large_number
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, flag))
print('-' * 80)
score += flag
print(
"Score: {0}/{1} = {2:.2f}%".format(score, len(examples), score / len(examples) * 100))
get_it_done
函数首先在mapping
创建,其中键是choice
中每个number
的出现范围。然后按mapping
dict的每个键中的第一项对其进行排序。下一步是创建graph
。然后使用find_shortest_path
函数,我们可以找到以最佳方式构建结果的最短路径。最后,我们可以再次使用mapping
,根据其范围返回choices
。如果有一个范围,我们就会遇到所有数字都包含相同的两个值的情况,因此规则是不同的。我们可以直接从choices
(按降序排序)收集数字,直到结果的长度与number
的长度相同。
答案 3 :(得分:-3)
def find_shortest_path(graph, start, end, path=[]):
path = path + [start]
if start == end:
return path
if start not in graph:
return None
shortest = None
for node in graph[start]:
if node not in path:
newpath = find_shortest_path(graph, node, end, path)
if newpath:
if not shortest or len(newpath) < len(shortest):
shortest = newpath
return shortest
def get_it_done(choices, number):
mapping = {}
graph = {}
for choice in choices:
if choice in number:
_from = number.index(choice)
_to = _from + len(choice)
mapping.setdefault((_from, _to), choice)
items = sorted(mapping.items(), key=lambda x: x[0])
for _range, value in items:
_from, _to = _range
graph.setdefault(_from, []).append(_to)
start = 0
end = _range[1] #this is hack, works only in python 2.7
path = find_shortest_path(graph, start, end)
ranges = [tuple(path[i:i+2]) for i in range(len(path) - 1)]
if len(ranges) == 1:
return [mapping[(start, graph[start][-1])]]
return [mapping[_range] for _range in ranges]
if __name__ == "__main__":
examples = [
# Example1 ->
# Solution ['012345678910203040506070', '80', '90', '100', '200', '300', '400', '500', '600', '700', '800', '900']
(
[
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"100", "200", "300", "400", "500", "600", "700", "800", "900",
"012345678910203040506070"
],
"0123456789102030405060708090100200300400500600700800900"
),
## Example2
## Solution ['100']
(
["0", "1", "10", "100"],
"100"
),
## Example3
## Solution ['101234567891020304050', '6070809010020030040050', '0600700800900']
(
[
"10", "20", "30", "40", "50", "60", "70", "80", "90",
"012345678910203040506070",
"101234567891020304050",
"6070809010020030040050",
"0600700800900"
],
"10123456789102030405060708090100200300400500600700800900"
),
### Example4
### Solution ['12', '34', '56', '78', '90']
(
[
"12", "34", "56", "78", "90",
"890",
],
"1234567890"
),
## Example5
## Solution ['12', '34']
(
[
"1", "2", "3",
"12", "23", "34"
],
"1234"
)
]
for (choices, large_number) in examples:
res = get_it_done(choices, large_number)
print("{0}\n{1}\n{2} --> {3}".format(
large_number, "".join(res), res, "".join(res) == large_number))
print('-' * 80)