我有一个问题要解决给定字符串source
和搜索条件criteria
的集合,算法必须返回包含所有项目的source
的最短子字符串criteria
。
=================================
更新
hello world
==================================
String source = "aaa wwwww fgffsd ththththt sss sgsgsgsghs bfbfb hhh sdfg kkk dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss", "hhh" };
上面的输入应返回以下子字符串:kkk wdwd aaa vcvc hhh zxzx sss
不幸的是,我花了很多时间尝试编写这样的算法,但我无法正确理解它。下面是我到目前为止的代码:
public struct Extraction
{
public int Start { get; set; }
public int End { get; set; }
public int Length
{
get
{
var length = this.End - this.Start;
return length;
}
}
public Extraction(int start, int end)
{
this.Start = start;
this.End = end;
}
}
public class TextExtractor
{
private String _source;
private Dictionary<String, List<Int32>> _criteriaIndexes;
private Dictionary<String, int> _entryIndex;
public TextExtractor(String source, List<String> searchCriteria)
{
this._source = source;
this._criteriaIndexes = this.ExtractIndexes(source, searchCriteria);
this._entryIndex = _criteriaIndexes.ToDictionary(x => x.Key, v => 0);
}
public String Extract()
{
List<Extraction> possibleExtractions = new List<Extraction>();
int index = 0;
int min = int.MaxValue;
int max = 0;
bool shouldStop = false;
while (index < _criteriaIndexes.Count && !shouldStop)
{
Boolean compareWithAll = index == _criteriaIndexes.Count - 1;
if (!compareWithAll)
{
var current = _criteriaIndexes.ElementAt(index);
this.CalculateMinMax(current, ref min, ref max);
index++;
}
else
{
var entry = _criteriaIndexes.Last();
while (_entryIndex[entry.Key] < entry.Value.Count)
{
int a = min;
int b = max;
this.CalculateMinMax(entry, ref a, ref b);
_entryIndex[entry.Key]++;
Extraction ext = new Extraction(a, b);
possibleExtractions.Add(ext);
}
int k = index - 1;
while (k >= 0)
{
var prev = _criteriaIndexes.ElementAt(k);
if (prev.Value.Count - 1 > _entryIndex[prev.Key])
{
_entryIndex[prev.Key]++;
break;
}
else
{
k--;
}
}
shouldStop = _criteriaIndexes.All(x => x.Value.Count - 1 <= _entryIndex[x.Key]);
_entryIndex[entry.Key] = 0;
index = 0;
min = int.MaxValue;
max = 0;
}
}
Extraction shortest = possibleExtractions.First(x => x.Length.Equals(possibleExtractions.Min(p => p.Length)));
String result = _source.Substring(shortest.Start, shortest.Length);
return result;
}
private Dictionary<String, List<Int32>> ExtractIndexes(String source, List<String> searchCriteria)
{
Dictionary<String, List<Int32>> result = new Dictionary<string, List<int>>();
foreach (var criteria in searchCriteria)
{
Int32 i = 0;
Int32 startingIndex = 0;
var indexes = new List<int>();
while (i > -1)
{
i = source.IndexOf(criteria, startingIndex);
if (i > -1)
{
startingIndex = i + 1;
indexes.Add(i);
}
}
if (indexes.Any())
{
result.Add(criteria, indexes);
}
}
return result;
}
private void CalculateMinMax(KeyValuePair<String, List<int>> current, ref int min, ref int max)
{
int j = current.Value[_entryIndex[current.Key]];
if (j < min)
{
min = j;
}
int indexPlusWordLength = j + current.Key.Length;
if (indexPlusWordLength > max)
{
max = indexPlusWordLength;
}
}
}
如果有人能指出我在算法中出错的地方,我将不胜感激。而且,我觉得这是一个非常天真的实现。也许有一种更好的方法来解决这个问题,而不是试图尝试组合索引?
谢谢!
答案 0 :(得分:2)
这是一个更简单的算法,可以为您提供最短的子串。
void Main()
{
String source = "aaa wwwww fgffsd ththththt sss ww sgsgsgsghs bfbfb hhh sdfg kkk " +
"dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss ww nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss ww", "hhh" };
var result = GetAllSubstringContainingCriteria(source, criteria)
.OrderBy(sub => sub.Length).FirstOrDefault();
// result is "kkk wdwd aaa vcvc hhh zxzx sss ww"
}
private IEnumerable<string> GetAllSubstringContainingCriteria(
string source, List<string> criteria)
{
for (int i = 0; i < source.Length; i++)
{
var subString = source.Substring(i);
if (criteria.Any(crit => subString.StartsWith(crit)))
{
var lastWordIndex =
GetLastCharacterIndexFromLastCriteriaInSubstring(subString, criteria);
if (lastWordIndex >= 0)
yield return string.Join(" ", subString.Substring(0, lastWordIndex));
}
else
continue;
}
}
private int GetLastCharacterIndexFromLastCriteriaInSubstring(
string subString, List<string> criteria)
{
var results = criteria.Select(crit => new {
index = subString.IndexOf(crit),
criteria = crit});
return results.All(result => result.index >= 0)
? results.Select(result => result.index + result.criteria.Length).Max()
: -1;
}
答案 1 :(得分:0)
让Java内置类完成工作。如何将您的标准转换为正则表达式模式。如果标准是X或Y或Z. 。 。,将其转换为“(X)|(Y)|(Z)| ...”形式的正则表达式,编译它,并对源字符串执行它。
当然,这会返回最左边的匹配。你可以编写一个非常直接的循环来迭代所有的事件,缓存它们,然后选择最短的 - 或最左边的最短的 - 或者,如果两个或更多个同样短,那么就是所有这些。