
时间:2018-04-10 23:46:51

标签: c# arrays regex list

我从外部来源获得一个字符串列表(List<string>),其内容一直在变化。

我想遍历每个字符串,找出所有字符串中按相同顺序出现的共同单词序列。




The book named The Lord of the Rings is a classic.   The book named War and Peace is a classic.   The book named The Three Musketeers is a classic.

“The book named”将被删除;“is a classic.”将被删除;“The book named The”这一序列不会被删除,因为“War and Peace”不以“The”开头。







I went to The Home Depot.   I went to Walgreens.   I went to Best Buy.

“I went to”被删除。


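The basketball team Los Angeles Lakers are my favorite.   The basketball team New York Knicks are my favorite.   The basketball team Chicago Bulls are my favorite.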

“The basketball team”被删除;“are my favorite.”也被删除。







// Sample input: every sentence shares a leading and a trailing word sequence
// around the title that should be extracted.
List<string> sentences = new List<string>()
{
    "The book named The Lord of the Rings is a classic.",
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",
};

// Receives one extracted title per input sentence.
List<string> titles = new List<string>();

for (int i = 0; i < sentences.Count; i++)
{
    // Add Titles to their own List
    titles.Add(FindTitle(sentences[i]));
}

/// <summary>
/// Placeholder for the title extraction the question asks about.
/// </summary>
/// <param name="sentence">Sentence that contains the title; not used by this stub.</param>
/// <returns>Always <see cref="string.Empty"/>; the extraction steps are only outlined below.</returns>
String FindTitle(string sentence)
{
    string title = string.Empty;

    // compare all strings in List
    // group common word sequences prefix (The book named)
    // group common word sequences suffix (is a classic.)
    // remove those word sequences from each string in List

    return title;
}

2 个答案:

答案 0 :(得分:1)

这是我的方法。我采取了性能路线 - 我猜仍然可以优化。


使用 Stopwatch 对比了我的方案与 Rufus L 的方案的耗时。

enter image description here

用法——以 Rufus 的测试句子作为输入:

/// <summary>
/// Builds the sample input: four groups of sentences, each group sharing a
/// common leading and/or trailing word sequence around a varying middle part.
/// </summary>
/// <returns>A new list of sentence groups on every call.</returns>
private static List<List<string>> GetTestSentences()
{
    return new List<List<string>>
    {
        new List<string>()
        {
            "The book named The Lord of the Rings is a classic.",
            "The book named War and Peace is a classic.",
            "The book named The Three Musketeers is a classic.",
        },
        new List<string>
        {
            "I went to The Home Depot.",
            "I went to Walgreens.",
            "I went to Best Buy."
        },
        new List<string>
        {
            "The basketball team Los Angeles Lakers are my favorite.",
            "The basketball team New York Knicks are my favorite.",
            "The basketball team Chicago Bulls are my favorite."
        },
        new List<string>()
        {
            "The book named Lord of the Flies is a classic (500 This is a test)",
            "The book named Wuthering Heights is a classic (500 This is a test)",
            "The book named Great Expectations is a classic (500 This is a test)",
            "The book named The Lord of the Rings is a classic (500 This is a test)",
            "The book named War and Peace is a classic (500 This is a test)"
        }
    };
}


// For each group, derive the shared prefix/suffix from the first two sentences
// and print every sentence next to its trimmed form.
foreach (var sentenceList in GetTestSentences())
{
    var prefix = FindMatchingPattern(sentenceList[0], sentenceList[1], true);
    var suffix = FindMatchingPattern(sentenceList[0], sentenceList[1], false);

    // Anchor the escaped text so only a leading prefix / trailing suffix is
    // removed; an unanchored pattern would also delete the same words if they
    // happened to occur in the middle of a sentence.
    var prefixPattern = prefix.Length > 0 ? "^" + Regex.Escape(prefix) : null;
    var suffixPattern = suffix.Length > 0 ? Regex.Escape(suffix) + "$" : null;

    foreach (var item in sentenceList)
    {
        var result = item;
        if (prefixPattern != null)
        {
            result = Regex.Replace(result, prefixPattern, string.Empty);
        }
        if (suffixPattern != null)
        {
            result = Regex.Replace(result, suffixPattern, string.Empty);
        }
        Console.WriteLine($"{item} --> {result}");
    }

    Console.WriteLine(new string('-', Console.WindowWidth));
}


/// <summary>
/// Finds the run of whole words that two strings share at their start or at their end.
/// </summary>
/// <remarks>
/// Characters are compared one by one. A word is committed to the result only
/// when the space delimiting it has matched as well, so a partially matching
/// word, or an outermost word with no delimiter, is never included.
/// </remarks>
/// <param name="sample1">First string to compare.</param>
/// <param name="sample2">Second string to compare.</param>
/// <param name="forwardDirection">
/// <c>true</c> to compare from the start (common prefix);
/// <c>false</c> to compare from the end (common suffix).
/// </param>
/// <returns>
/// Forward: the shared leading words, each followed by one space
/// (e.g. <c>"The book named "</c>). Backward: the shared trailing words, each
/// preceded by one space (e.g. <c>" is a classic."</c>). Empty when no whole
/// word is shared.
/// </returns>
private static string FindMatchingPattern(string sample1, string sample2, bool forwardDirection)
{
    // Only the overlapping length can match, so drive the loops by the shorter string.
    string shorter = sample1;
    string longer = sample2;
    if (sample1.Length > sample2.Length)
    {
        shorter = sample2;
        longer = sample1;
    }

    StringBuilder matchingPattern = new StringBuilder();
    // Collects the characters of the word currently being matched.
    StringBuilder wordHolder = new StringBuilder();

    if (forwardDirection)
    {
        // Stop at the first differing character.
        for (int idx = 0; idx < shorter.Length && shorter[idx] == longer[idx]; idx++)
        {
            if (shorter[idx] == ' ')
            {
                // Word boundary reached on both sides: commit the word.
                matchingPattern.Append(wordHolder.ToString()).Append(' ');
                wordHolder.Clear();
            }
            else
            {
                wordHolder.Append(shorter[idx]);
            }
        }
    }
    else
    {
        // Walk both strings from their last character using indices instead of
        // repeatedly copying the strings with Remove().
        int shortIdx = shorter.Length - 1;
        int longIdx = longer.Length - 1;

        // longer is at least as long as shorter, so longIdx stays valid while shortIdx >= 0.
        while (shortIdx >= 0 && shorter[shortIdx] == longer[longIdx])
        {
            if (shorter[shortIdx] == ' ')
            {
                matchingPattern.Insert(0, " " + wordHolder);
                wordHolder.Clear();
            }
            else
            {
                wordHolder.Insert(0, shorter[shortIdx]);
            }

            shortIdx--;
            longIdx--;
        }
    }

    return matchingPattern.ToString();
}

答案 1 :(得分:1)




// Counter-example input: the first two titles share leading words ("Lord of the")
// and a trailing word ("2") that the remaining sentences do not have.
new List<string>()
    "The book named Lord of the Rings 2 is a classic.",
    "The book named Lord of the Flies 2 is a classic.",
    "The book named This is pretty is a classic.",                
    "The book named War and Peace is a classic.",
    "The book named The Three Musketeers is a classic.",                

在这里,如果我们只比较前两个句子,我们确定公共前缀是"The book named Lord of the",这是不正确的。我们还确定公共后缀为"2 is a classic.",这也是不正确的。


/// <summary>
/// Removes the word sequence that all sentences share at the start and the one
/// they all share at the end, returning what is left of each sentence.
/// </summary>
/// <param name="sentences">Sentences to trim. The list itself is not modified.</param>
/// <param name="minSeqenceLength">
/// Threshold for a shared sequence to be removed; it is compared against the
/// number of spaces in the sequence (see the review note in the body).
/// </param>
/// <returns>
/// <c>null</c> when <paramref name="sentences"/> is <c>null</c>; a copy of the
/// input when there are fewer than two sentences or any sentence has too few
/// spaces; a list of empty strings when all sentences are identical; otherwise
/// one trimmed string per input sentence, in the same order.
/// </returns>
public static List<string> RemoveCommonPrefixAndSuffix(List<string> sentences,
    int minSeqenceLength = 2)
{
    if (sentences == null) return null;

    if (sentences.Count < 2 ||
        sentences.Any(s => s.Count(c => c == ' ') < minSeqenceLength - 1))
    {
        return sentences.ToList();
    }

    if (sentences.All(s => s == sentences[0]))
    {
        return sentences.Select(s => string.Empty).ToList();
    }

    var sentenceWords = sentences.Select(s => s.Split()).ToList();
    var firstSentence = sentenceWords[0];
    // No shared sequence can be longer than the shortest sentence.
    var length = sentenceWords.Min(s => s.Length);
    var commonPrefix = new StringBuilder();
    var commonSuffix = new StringBuilder();
    var prefixDone = false;
    var suffixDone = false;

    for (var i = 0; i < length && !(prefixDone && suffixDone); i++)
    {
        // Extend the prefix while every sentence has the same i-th word;
        // the first mismatch ends the prefix for good.
        if (!prefixDone && sentenceWords.All(s => s[i] == firstSentence[i]))
        {
            commonPrefix.Append(firstSentence[i] + " ");
        }
        else
        {
            prefixDone = true;
        }

        // Same idea for the suffix, counting words from the end.
        if (!suffixDone && sentenceWords.All(s =>
            s[s.Length - i - 1] == firstSentence[firstSentence.Length - i - 1]))
        {
            commonSuffix.Insert(0, firstSentence[firstSentence.Length - i - 1] + " ");
        }
        else
        {
            suffixDone = true;
        }
    }

    // NOTE(review): each collected word carries a trailing space, so the space
    // count equals the word count here; with the default of 2 a single shared
    // word still passes this check. Confirm whether that is intended.
    var prefix = commonPrefix.ToString().Count(c => c == ' ') >= minSeqenceLength - 1
        ? commonPrefix.ToString()
        : string.Empty;

    var suffix = commonSuffix.ToString().Count(c => c == ' ') >= minSeqenceLength - 1
        ? commonSuffix.ToString()
        : string.Empty;

    // The suffix was built with a trailing space while in the sentence it has a
    // leading one; the lengths are equal, so the separating space is removed too.
    var commonLength = prefix.Length + suffix.Length;

    return sentences
        .Select(s => s.Length > commonLength
            ? s.Substring(prefix.Length, s.Length - prefix.Length - suffix.Length)
            : string.Empty)
        .ToList();
}


/// <summary>
/// Builds the sample input: eight groups of sentences, each exercising one
/// prefix/suffix scenario described by the comment above the group.
/// </summary>
/// <returns>A new list of sentence groups on every call.</returns>
private static List<List<string>> GetTestSentences()
{
    return new List<List<string>>
    {
        // Prefix-only test
        new List<string>
        {
            "I went to The Home Depot",
            "I went to Walgreens",
            "I went to Best Buy",
        },
        // Suffix-only test
        new List<string>
        {
            "Game of Thrones is a good TV series",
            "Breaking Bad is a good TV series",
            "The Office is a good TV series",
        },
        // Prefix / Suffix test
        new List<string>
        {
            "The basketball team Los Angeles Lakers are my favorite",
            "The basketball team New York Knicks are my favorite",
            "The basketball team Chicago Bulls are my favorite",
        },
        // No prefix or suffix - all sentences are different
        new List<string>
        {
            "I went to The Home Depot",
            "Game of Thrones is a good TV series",
            "The basketball team Los Angeles Lakers are my favorite",
        },
        // All sentences are the same - no "topic" between prefix and suffix
        new List<string>()
        {
            "These sentences are all the same",
            "These sentences are all the same",
            "These sentences are all the same",
        },
        // Some sentences have no content between prefix and suffix
        new List<string>()
        {
            "This sentence has no topic",
            "This sentence [topic here] has no topic",
            "This sentence has no topic",
            "This sentence [another one] has no topic",
        },
        // First two topics have common beginnings
        new List<string>()
        {
            "The book named Lord of the Rings is a classic",
            "The book named Lord of the Flies is a classic",
            "The book named This is pretty is a classic",
            "The book named War and Peace is a classic",
            "The book named The Three Musketeers is a classic",
        },
        // The first two topics have a common ending
        new List<string>
        {
            "The movie named Matrix 2 is very good",
            "The movie named Avatar 2 is very good",
            "The movie named The Sound of Music is very good",
            "The movie named Terminator 2 is very good",
        },
    };
}


/// <summary>
/// Runs both trimming approaches over the sample sentence groups, prints each
/// sentence next to its trimmed form, and reports the elapsed time of each
/// approach separately.
/// </summary>
private static void Main()
{
    var sentenceLists = GetTestSentences();
    // Pad every original sentence to the same width so the results line up.
    var padLength = sentenceLists.Max(t => t.Max(s => s.Length)) + 2;
    Console.WriteLine("\nComparison Results\n------------------\n");

    // Rufus' solution
    var sw = Stopwatch.StartNew();
    foreach (var sentenceList in sentenceLists)
    {
        var trimmedSentences = RemoveCommonPrefixAndSuffix(sentenceList);

        for (var j = 0; j < trimmedSentences.Count; j++)
        {
            Console.WriteLine("{0} {1}", sentenceList[j].PadRight(padLength, '.'),
                trimmedSentences[j]);
        }
    }
    sw.Stop();

    Console.WriteLine($"Rufus' solution took {sw.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    // Prateek's solution
    // Restart so this measurement does not include the time of the first run.
    sw.Restart();
    foreach (var sentenceList in sentenceLists)
    {
        var prefix = FindMatchingPattern(sentenceList[0], sentenceList[1], true);
        var suffix = FindMatchingPattern(sentenceList[0], sentenceList[1], false);

        // Anchor the escaped text so only a leading prefix / trailing suffix is
        // removed, never the same words in the middle of a sentence.
        var prefixPattern = prefix.Length > 0 ? "^" + Regex.Escape(prefix) : null;
        var suffixPattern = suffix.Length > 0 ? Regex.Escape(suffix) + "$" : null;

        foreach (var item in sentenceList)
        {
            var result = item;
            if (prefixPattern != null)
            {
                result = Regex.Replace(result, prefixPattern, string.Empty);
            }
            if (suffixPattern != null)
            {
                result = Regex.Replace(result, suffixPattern, string.Empty);
            }
            Console.WriteLine($"{item.PadRight(padLength, '.')} {result}");
        }
    }
    sw.Stop();

    Console.WriteLine($"Prateek's solution took {sw.ElapsedMilliseconds} ms\n");
    Console.WriteLine(new string('-', Console.WindowWidth));

    GetKeyFromUser("\nDone!! Press any key to exit...");
}


enter image description here