查找一组字符串的第一个公共子字符串

时间:2014-01-24 20:01:54

标签: c# set-theory longest-substring

我正在寻找First Common Substring的实现

Mike is not your average guy. I think you are great.
Jim is not your friend. I think you are great.
Being different is not your fault. I think you are great.

使用最长公共子串实现(并忽略标点符号),你会得到“我认为你很棒”,但我在寻找第一个出现的公共子串,在这个例子中:

is not your

也许是一个生成和排序所有常见子串的列表的实现,我可以从中获取第一个。

修改

被比较的代币将是完整的单词。寻找第一个最长整个单词序列的贪婪匹配。 (假设在方法中使用了后缀树,树的每个节点都是一个单词)

2 个答案:

答案 0 :(得分:3)

有很多步骤可以做到这一点。

  1. 删除标点符号
  2. 将句子分解为单词列表
  3. 创建所有连续单词组合的字符串(min:1,max:wordCount)
  4. 在新的字符串(子句)
  5. 列表中加入三个列表
  6. 相应排序。
  7. 代码:

    static void Main(string[] args)
    {
        var sentence1 = "Mike is not your average guy. I think you are great.";
        var sentence2 = "Jim is not your friend. I think you are great.";
        var sentence3 = "Being different is not your fault. I think you are great.";
    
        //remove all punctuation 
        // http://stackoverflow.com/questions/421616
        sentence1 = new string(
          sentence1.Where(c => !char.IsPunctuation(c)).ToArray());
        sentence2 = new string(
          sentence2.Where(c => !char.IsPunctuation(c)).ToArray());
        sentence3 = new string(
          sentence3.Where(c => !char.IsPunctuation(c)).ToArray());
    
        //seperate into words
        var words1 = sentence1.Split(new char[] { ' ' },
          StringSplitOptions.RemoveEmptyEntries).ToList();
        var words2 = sentence2.Split(new char[] { ' ' },          
          StringSplitOptions.RemoveEmptyEntries).ToList();
        var words3 = sentence3.Split(new char[] { ' ' }, 
          StringSplitOptions.RemoveEmptyEntries).ToList();
    
        //create substring list
        var subSentence1 = CreateSubstrings(words1);
        var subSentence2 = CreateSubstrings(words2);
        var subSentence3 = CreateSubstrings(words3);
    
        //join then like a Sql Table
        var subSentences = subSentence1
            .Join(subSentence2,
                sub1 => sub1.Value,
                sub2 => sub2.Value,
                (sub1, sub2) => new { Sub1 = sub1, 
                                      Sub2 = sub2 })
            .Join(subSentence3,
                sub1 => sub1.Sub1.Value,
                sub2 => sub2.Value,
                (sub1, sub2) => new { Sub1 = sub1.Sub1, 
                                      Sub2 = sub1.Sub2, 
                                      Sub3 = sub2 })
            ;
    
        //Sorted by Lowest Index, then by Maximum Words
        subSentences = subSentences.OrderBy(s => s.Sub1.Rank)
          .ThenByDescending(s => s.Sub1.Length)
          .ToList();
    
        //Sort by Maximum Words, then Lowest Index
        /*subSentences = subSentences.OrderByDescending(s => s.Sub1.Length)
            .ThenBy(s => s.Sub1.Rank)
            .ToList();//*/
    
        foreach (var subSentence in subSentences)
        {
            Console.WriteLine(subSentence.Sub1.Length.ToString() + " " 
              + subSentence.Sub1.Value);
            Console.WriteLine(subSentence.Sub2.Length.ToString() + " " 
              + subSentence.Sub2.Value);
            Console.WriteLine(subSentence.Sub3.Length.ToString() + " " 
              + subSentence.Sub3.Value);
            Console.WriteLine("======================================");
        }
    
        Console.ReadKey();
    
    }
    
    //this could probably be done better -Erik
    internal static List<SubSentence> CreateSubstrings(List<string> words)
    {
        var result = new List<SubSentence>();
        for (int wordIndex = 0; wordIndex < words.Count; wordIndex++)
        {
            var sentence = new StringBuilder();
            int currentWord = wordIndex;
            while (currentWord < words.Count - 1)
            {
                sentence.Append(words.ElementAt(currentWord));
                result.Add(new SubSentence() { Rank = wordIndex, 
                  Value = sentence.ToString(), 
                  Length = currentWord - wordIndex + 1 });
                sentence.Append(' ');
                currentWord++;
            }
            sentence.Append(words.Last());
            result.Add(new SubSentence() { Rank = wordIndex, 
              Value = sentence.ToString(), 
              Length = words.Count - wordIndex });
        }
        return result;
    }
    
    internal class SubSentence
    {
        public int Rank { get; set; }
        public string Value { get; set; }
        public int Length { get; set; }
    }
    

    结果:

      

    3不是你的

         

    3不是你的

         

    3不是你的

         

    ======================================

         

    2不是

         

    2不是

         

    2不是

         

    ======================================

         

    1是

         

    1是

         

    1是

         

    ======================================

         

    2不是你的

         

    2不是你的

         

    2不是你的

         

    ======================================

         

    1不是

         

    1不是

         

    1不是

         

    ======================================

         

    1你的

         

    1你的

         

    1你的

         

    ======================================

         

    5我觉得你很棒

         

    5我觉得你很棒

         

    5我觉得你很棒

         

    ======================================

         

    4我认为你是

         

    4我认为你是

         

    4我认为你是

         

    ======================================

         

    3我想你

         

    3我想你

         

    3我想你

         

    ======================================

         

    2我想

         

    2我想

         

    2我想

         

    ======================================

         

    1我

         

    1我

         

    1我

         

    ======================================

         

    4认为你很棒

         

    4认为你很棒

         

    4认为你很棒

         

    ======================================

         

    3认为你是

         

    3认为你是

         

    3认为你是

         

    ======================================

         

    2认为你

         

    2认为你

         

    2认为你

         

    ======================================

         

    1想

         

    1想

         

    1想

         

    ======================================

         

    3你很棒

         

    3你很棒

         

    3你很棒

         

    ======================================

         

    2你是

         

    2你是

         

    2你是

         

    ======================================

         

    1你

         

    1你

         

    1你

         

    ======================================

         

    2很棒

         

    2很棒

         

    2很棒

         

    ======================================

         

    1是

         

    1是

         

    1是

         

    ======================================

         

    1很棒

         

    1很棒

         

    1很棒

         

    ======================================

答案 1 :(得分:0)

这里有一些可以做你想要的东西。你实际上会调整为预先构建你的字符串列表,传入它并且它会为你找到...在这个例子中,短语将基于最短字符串作为基线的字符串。

public void SomeOtherFunc()
{
    List<string> MyTest = new List<string>();
    MyTest.Add( "Mike is not your average guy. I think you are great." );
    MyTest.Add( "Jim is not your friend. I think you are great." );
    MyTest.Add( "Being different is not your fault. I think you are great." );

    string thePhrase = testPhrase( MyTest );
    MessageBox.Show( thePhrase );
}



public string testPhrase(List<string> test)
{

    // start with the first string and find the shortest.
    // if we can't find a short string in a long, we'll never find a long string in short
    // Ex "To testing a string that is longer than some other string" 
    // vs "Im testing a string that is short"
    // Work with the shortest string.
    string shortest = test[0];
    string lastGoodPhrase = "";
    string curTest;
    int firstMatch = 0;
    int lastMatch = 0;
    int allFound;
    foreach (string s in test)
        if (s.Length < shortest.Length)
            shortest = s;

    // Now, we need to break the shortest string into each "word"
    string[] words = shortest.Split( ' ' );

    // Now, start with the first word until it is found in ALL phrases
    for (int i = 0; i < words.Length; i++)
    {
        // to prevent finding "this" vs "is"
        lastGoodPhrase = " " + words[i] + " ";

        allFound = 0;
        foreach (string s in test)
        {
            // always force leading space for string
            if ((" "+s).Contains(lastGoodPhrase))
               allFound++;
            else
               // if not found in ANY string, its not found in all, get out
               break;
        }

        if (allFound == test.Count)
        {
            // we've identified the first matched field, get out for next phase test
            firstMatch = i;
            // also set the last common word to the same until we can test next...
            lastMatch = i;
            break;
        }
    }

    // if no match, get out
    if (firstMatch == 0)
        return "";

    // we DO have at least a first match, now keep looking into each subsequent
    // word UNTIL we no longer have a match.
    for( int i = 1; i < words.Length - firstMatch; i++ )
    {
        // From where the first entry was, build out the ENTIRE PHRASE
        // until the end of the original sting of words and keep building 1 word back
        curTest = " ";
        for (int j = firstMatch; j <= firstMatch + i; j++)
            curTest += words[j] + " ";

        // see if all this is found in ALL strings
        foreach (string s in test)
            // we know we STARTED with a valid found phrase.
            // as soon as a string NO LONGER MATCHES the new phrase,
            // return the last VALID phrase
            if (!(" " + s).Contains(curTest))
                return lastGoodPhrase;

        // if this is still a good phrase, set IT as the newest
        lastGoodPhrase = curTest;
    }

    return lastGoodPhrase;
}