Question

我知道有很多字符串相似度算法，但我不知道哪个算法最适合我的问题。

我的字符串长度各不相同，通常其中一个会比另一个多出一些无关的附加内容。我想要一种算法，当字符串包含相同且没有拼写错误的单词时，能给出较高的相似度“分数”。例如，Stuff and Things corp.与Stuff and Things corporation或101, Stuff and Things corporat或Stuff and Things相同。

但在我的案例中，字符串color和colour，Loremipsum和Olremipsum完全不同。我的字符串永远不会有错误输入或交换的字符，字符串长度为1到50个字符。

编辑：相同单词的顺序非常重要，New York city与York New city应被视为不同，或者相似程度很低。

感谢您的帮助

Answer 1

好的，即使规则仍然不明确，我也会尝试一下。

总结您的要求：

在另一个句子中找到最长的公共单词序列
至少要有两个单词相同，因此New York和New Delhi不相等
顺序很重要，因此New York city和York New city不相等

方法FindCommonWords返回两个句子共有的单词序列；如果没有找到公共的单词序列，则返回空序列（Enumerable.Empty<string>）。

它首先通过预定义的单词分隔符列表将两个字符串拆分为两个string[]。然后它检查所有“子序列”是否以相同的顺序包含在另一个数组中（使用扩展方法IndexOfSequence）。

// Characters treated as word boundaries when FindCommonWords splits its input strings:
// whitespace (newline, tab, space), common punctuation, slashes, brackets and quote characters.
private static readonly char[] wordSeparators = { '\n', '\t', ',', '.', '!', '?', ';', ':', ' ', '-', '/', '\\', '[', ']', '(', ')', '<', '>', '@', '"', '\'' };

/// <summary>
/// Finds the longest run of consecutive words that appears, in the same order, in both strings.
/// </summary>
/// <param name="str1">First string; the returned words are taken from this string.</param>
/// <param name="str2">Second string, searched for word runs taken from <paramref name="str1"/>.</param>
/// <param name="comparer">
/// Comparer used to decide whether two words are equal;
/// <see cref="StringComparer.CurrentCulture"/> is used when <c>null</c>.
/// </param>
/// <returns>
/// The longest common word run of at least two words (the leftmost one in
/// <paramref name="str1"/> when several have the same length), or an empty sequence
/// when no such run exists.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="str1"/> or <paramref name="str2"/> is <c>null</c>.
/// </exception>
public static IEnumerable<string> FindCommonWords(string str1, string str2, StringComparer comparer = null)
{
    if (str1 == null)
        throw new ArgumentNullException("str1", "Both input strings must not be null!");
    if (str2 == null)
        throw new ArgumentNullException("str2", "Both input strings must not be null!");

    if (comparer == null) comparer = StringComparer.CurrentCulture;
    str1 = str1.Trim();
    str2 = str2.Trim();

    string[] words1 = str1.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);
    string[] words2 = str2.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);

    // A common run can never be longer than the shorter of the two word lists.
    int longestPossible = Math.Min(words1.Length, words2.Length);
    if (longestPossible < 2)
        return Enumerable.Empty<string>(); // a single word does not count as a common word sequence

    // Try the longest candidates first, so the first hit is the longest common run.
    // Starting at longestPossible (not words1.Length - 1) ensures that all of str1
    // is also considered, e.g. "Stuff and Things" inside "Stuff and Things corporation".
    for (int wordCount = longestPossible; wordCount >= 2; wordCount--)
    {
        // slide a window of wordCount words over words1 from left to right
        for (int skipCount = 0; wordCount + skipCount <= words1.Length; skipCount++)
        {
            // materialize the window so the caller gets a stable, already-evaluated result
            string[] wordSeq = words1.Skip(skipCount).Take(wordCount).ToArray();
            if (words2.IndexOfSequence(wordSeq, comparer) >= 0)
            {
                return wordSeq;
            }
        }
    }
    return Enumerable.Empty<string>();
}

下面是一个扩展方法，它可能对其他需求也有用：

/// <summary>
/// Returns the zero-based index in <paramref name="input"/> at which
/// <paramref name="sequence"/> first occurs as a run of consecutive elements.
/// </summary>
/// <typeparam name="TSource">Element type of both sequences.</typeparam>
/// <param name="input">The sequence to search in. It is fully enumerated once.</param>
/// <param name="sequence">The non-empty run of elements to look for. It is fully enumerated once.</param>
/// <param name="comparer">
/// Comparer used for element equality;
/// <see cref="EqualityComparer{T}.Default"/> is used when <c>null</c>.
/// </param>
/// <returns>The start index of the first occurrence, or -1 when there is none.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="input"/> or <paramref name="sequence"/> is <c>null</c>.
/// </exception>
/// <exception cref="ArgumentException"><paramref name="sequence"/> is empty.</exception>
public static int IndexOfSequence<TSource>(this IEnumerable<TSource> input, IEnumerable<TSource> sequence, IEqualityComparer<TSource> comparer)
{
    if (input == null) throw new ArgumentNullException("input");
    if (sequence == null) throw new ArgumentNullException("sequence");

    // Materialize both sides: each source is enumerated exactly once, and a failed
    // partial match can restart from the next input position by index.
    List<TSource> needle = sequence.ToList();
    if (needle.Count == 0) throw new ArgumentException("Sequence must not be empty", "sequence");
    if (comparer == null)
    {
        comparer = EqualityComparer<TSource>.Default;
    }
    List<TSource> haystack = input.ToList();

    // Check every start position at which the whole needle still fits.
    for (int start = 0; start + needle.Count <= haystack.Count; start++)
    {
        int matched = 0;
        while (matched < needle.Count && comparer.Equals(haystack[start + matched], needle[matched]))
        {
            matched++;
        }
        if (matched == needle.Count)
            return start;
    }
    return -1;
}

以下是您的三个样本：

// Each row holds the two strings to compare; the trailing comment is the line printed for that pair.
string[][] samplePairs =
{
    new[] { "Stuff and Things corporation", "101, Stuff and Things corporat" }, // Stuff and Things
    new[] { "101, Stuff and Things corporat", "or Stuff and Things." },         // Stuff and Things
    new[] { "New York city", "York New city" },                                 // empty sequence, no match
};

foreach (string[] pair in samplePairs)
{
    // compare case-insensitively and print the shared words separated by single spaces
    IEnumerable<string> sharedWords = FindCommonWords(pair[0], pair[1], StringComparer.CurrentCultureIgnoreCase);
    Console.WriteLine(string.Join(" ", sharedWords));
}

请注意，它是从头开始编写的，未经过彻底测试。

字符串相似性，长度不同但单词相同

1 个答案: