我在Ruby中找到了后缀数组的实现并稍微改了一下。这就是我所拥有的:
class SuffixArray
def initialize(str)
@string = str
@suffix_array = []
(0...str.length).each do |i|
substring = @string[i...str.length]
@suffix_array << {:suffix=>substring, :index => i}
end
@sorted_suffix_array = @suffix_array.sort {|x,y| x[:suffix] <=> y[:suffix]}
end
def print_sorted
@sorted_suffix_array.each {|item| puts "#{item[:index]}=>#{item[:suffix]}"}
puts "total=>#{@sorted_suffix_array.size()}"
end
def print_unsorted
@suffix_array.each {|item| puts "#{item[:index]}=>#{item[:suffix]}"}
puts "total=>#{@suffix_array.size()}"
end
def find_substring(substring)
low = 0
high = @sorted_suffix_array.length
while(low <= high) do
mid = (low + high) / 2
comparison = @sorted_suffix_array[mid][:suffix]#[0..substring.length]
if comparison > substring
high = mid - 1
elsif comparison < substring
low = mid + 1
else
return @sorted_suffix_array[mid][:index]
end
end
end
end
它运作良好,但找不到我想要的所有子串。例如
a = SuffixArray.new("there is a man who likes dogs")
puts a.find_substring("man") #won't be found
puts a.find_substring("who likes dogs") #will be found
puts a.find_substring("o likes dogs") #will be found
如何更改算法以使其找到我想要的所有子字符串?
答案 0 :(得分:1)
您的代码几乎是正确的。我做了一些小修改,但它确实有效。
def find_substring(substring)
low = 0
high = @sorted_suffix_array.length-1
while(low <= high) do
mid = (low + high) / 2
comparison = @sorted_suffix_array[mid][:suffix][0...substring.length]
if comparison > substring
high = mid - 1
elsif comparison < substring
low = mid + 1
else
return @sorted_suffix_array[mid][:index]
end
end
end
答案 1 :(得分:1)
对于其他人;引用,这是没有将子字符串保存在哈希
中的引用要点:https://gist.github.com/bluetwin/5268722
class SuffixArray
attr_reader :suf, :string
def initialize(string)
@string = string
@suf = (0..string.size-1).sort_by{|i|@string[i..-1]}
end
def substring(idx)
@string[@suf[idx]..@string.size-1]
end
def bsearch(str)
low = 0
high = @suf.length-1
found = nil
while(low <= high) do
mid = (low + high) / 2
comp = substring(mid)
if comp > str
high = mid - 1
elsif comp < str
low = mid + 1
else
found = comp
low = high + 1
end
end
found
end
end