Substract包含所有搜索条件的最短字符串

时间:2016-06-30 22:03:06

标签: c# string algorithm search

我有一个问题要解决给定字符串source和搜索条件criteria的集合,算法必须返回包含所有项目的source的最短子字符串criteria

=================================

更新

  • 相同的搜索条件可能位于源字符串multiple中 倍。在这种情况下,需要返回子字符串 包含搜索条件的特定实例,以便 它是所有可能的子串中最短的。
  • 搜索项目中可以包含空格,例如hello world
  • 找到搜索条件的顺序无关紧要,只要它们都在结果子字符串中

==================================

String source = "aaa wwwww fgffsd ththththt sss sgsgsgsghs bfbfb hhh sdfg kkk dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss", "hhh" };

上面的输入应返回以下子字符串:kkk wdwd aaa vcvc hhh zxzx sss

不幸的是,我花了很多时间尝试编写这样的算法,但我无法正确理解它。下面是我到目前为止的代码:

public struct Extraction
{
    public int Start { get; set; }
    public int End { get; set; }
    public int Length
    {
        get
        {
            var length = this.End - this.Start;
            return length;
        }
    }

    public Extraction(int start, int end)
    {
        this.Start = start;
        this.End = end;
    }
}

public class TextExtractor
{
    private String _source;
    private Dictionary<String, List<Int32>> _criteriaIndexes;
    private Dictionary<String, int> _entryIndex;

    public TextExtractor(String source, List<String> searchCriteria)
    {
        this._source = source;
        this._criteriaIndexes = this.ExtractIndexes(source, searchCriteria);
        this._entryIndex = _criteriaIndexes.ToDictionary(x => x.Key, v => 0);
    }

    public String Extract()
    {
        List<Extraction> possibleExtractions = new List<Extraction>();

        int index = 0;
        int min = int.MaxValue;
        int max = 0;
        bool shouldStop = false;
        while (index < _criteriaIndexes.Count && !shouldStop)
        {
            Boolean compareWithAll = index == _criteriaIndexes.Count - 1;
            if (!compareWithAll)
            {
                var current = _criteriaIndexes.ElementAt(index);
                this.CalculateMinMax(current, ref min, ref max);
                index++;
            }
            else
            {
                var entry = _criteriaIndexes.Last();
                while (_entryIndex[entry.Key] < entry.Value.Count)
                {
                    int a = min;
                    int b = max;
                    this.CalculateMinMax(entry, ref a, ref b);

                    _entryIndex[entry.Key]++;
                    Extraction ext = new Extraction(a, b);
                    possibleExtractions.Add(ext);
                }
                int k = index - 1;

                while (k >= 0)
                {
                    var prev = _criteriaIndexes.ElementAt(k);
                    if (prev.Value.Count - 1 > _entryIndex[prev.Key])
                    {
                        _entryIndex[prev.Key]++;
                        break;
                    }
                    else
                    {
                        k--;
                    }
                }
                shouldStop = _criteriaIndexes.All(x => x.Value.Count - 1 <= _entryIndex[x.Key]);
                _entryIndex[entry.Key] = 0;
                index = 0;
                min = int.MaxValue;
                max = 0;
            }
        }

        Extraction shortest = possibleExtractions.First(x => x.Length.Equals(possibleExtractions.Min(p => p.Length)));
        String result = _source.Substring(shortest.Start, shortest.Length);
        return result;
    }

    private Dictionary<String, List<Int32>> ExtractIndexes(String source, List<String> searchCriteria)
    {
        Dictionary<String, List<Int32>> result = new Dictionary<string, List<int>>();
        foreach (var criteria in searchCriteria)
        {
            Int32 i = 0;
            Int32 startingIndex = 0;
            var indexes = new List<int>();
            while (i > -1)
            {
                i = source.IndexOf(criteria, startingIndex);
                if (i > -1)
                {
                    startingIndex = i + 1;
                    indexes.Add(i);
                }
            }
            if (indexes.Any())
            {
                result.Add(criteria, indexes);
            }

        }
        return result;
    }

    private void CalculateMinMax(KeyValuePair<String, List<int>> current, ref int min, ref int max)
    {
        int j = current.Value[_entryIndex[current.Key]];
        if (j < min)
        {
            min = j;
        }
        int indexPlusWordLength = j + current.Key.Length;
        if (indexPlusWordLength > max)
        {
            max = indexPlusWordLength;
        }
    }
}

如果有人能指出我在算法中出错的地方,我将不胜感激。而且,我觉得这是一个非常天真的实现。也许有一种更好的方法来解决这个问题,而不是试图尝试组合索引?

谢谢!

2 个答案:

答案 0 :(得分:2)

这是一个更简单的算法,可以为您提供最短的子串。

void Main()
{
    String source = "aaa wwwww fgffsd ththththt sss ww sgsgsgsghs bfbfb hhh sdfg kkk " +
        "dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss ww nbnbn";
    List<String> criteria = new List<string> { "kkk", "aaa", "sss ww", "hhh" };
    var result = GetAllSubstringContainingCriteria(source, criteria)
        .OrderBy(sub => sub.Length).FirstOrDefault();
    // result is "kkk wdwd aaa vcvc hhh zxzx sss ww"
}

private IEnumerable<string> GetAllSubstringContainingCriteria(
    string source, List<string> criteria)
{
    for (int i = 0; i < source.Length; i++)
    {
        var subString = source.Substring(i);
        if (criteria.Any(crit => subString.StartsWith(crit)))
        {
            var lastWordIndex = 
                GetLastCharacterIndexFromLastCriteriaInSubstring(subString, criteria);
            if (lastWordIndex >= 0)
                yield return string.Join(" ", subString.Substring(0, lastWordIndex));
        }
        else
            continue;
    }
}

private int GetLastCharacterIndexFromLastCriteriaInSubstring(
    string subString, List<string> criteria)
{
    var results = criteria.Select(crit => new { 
            index = subString.IndexOf(crit),
            criteria = crit});

    return results.All(result => result.index >= 0)
        ? results.Select(result => result.index + result.criteria.Length).Max()
        : -1;
}

答案 1 :(得分:0)

让Java内置类完成工作。如何将您的标准转换为正则表达式模式。如果标准是X或Y或Z. 。 。,将其转换为“(X)|(Y)|(Z)| ...”形式的正则表达式,编译它,并对源字符串执行它。

当然,这会返回最左边的匹配。你可以编写一个非常直接的循环来迭代所有的事件,缓存它们,然后选择最短的 - 或最左边的最短的 - 或者,如果两个或更多个同样短,那么就是所有这些。