我知道有很多字符串相似度算法,但我不知道哪个算法最适合我的问题。
我的字符串长度各不相同,通常其中一个会比另一个多出一些无关的附加内容。我想要一种算法,当字符串包含相同且没有拼写错误的单词时,给出很高的相似性“分数”。例如,Stuff and Things corp.
与Stuff and Things corporation
或101, Stuff and Things corporat
或Stuff and Things
相同。
但在我的案例中,字符串color
和colour
,Loremipsum
和Olremipsum
完全不同。我的字符串永远不会有错误输入或交换的字符,字符串长度为1到50个字符。
编辑:
相同单词的顺序非常重要:New York city
与York New city 应被视为不同,或者只有很低的相似度。
感谢您的帮助
答案 0 :(得分:2)
好的,即使规则仍然不明确,我也会尝试一下。
总结您的要求:
New York
和New Delhi
不相等(一个单词不算公共单词序列);New York city
和York New city
也不相等(单词顺序很重要)。方法FindCommonWords
将返回两个句子共有的单词序列;如果没有找到共有的单词序列,则返回空序列(Enumerable.Empty<string>
)。
它首先通过预定义的单词分隔符列表将两个字符串拆分为两个string[]
。然后它检查所有“子序列”是否以相同的顺序包含在另一个数组中(使用扩展方法IndexOfSequence
)。
// Characters treated as word boundaries when a sentence is split into words.
private static readonly char[] wordSeparators = { '\n', '\t', ',', '.', '!', '?', ';', ':', ' ', '-', '/', '\\', '[', ']', '(', ')', '<', '>', '@', '"', '\'' };

/// <summary>
/// Finds the longest run of consecutive words (at least two) from <paramref name="str1"/>
/// that also occurs, in the same order and without gaps, in <paramref name="str2"/>.
/// </summary>
/// <param name="str1">First sentence; its words are the candidates that are searched for.</param>
/// <param name="str2">Second sentence; the candidates are searched in its words.</param>
/// <param name="comparer">
/// Comparer used to match individual words. Defaults to
/// <see cref="StringComparer.CurrentCulture"/> when <c>null</c>.
/// </param>
/// <returns>
/// The longest common word sequence; when several candidates share the maximum length,
/// the one that starts earliest in <paramref name="str1"/> wins. An empty sequence is
/// returned when the sentences share no run of two or more words.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="str1"/> or <paramref name="str2"/> is <c>null</c>.
/// </exception>
public static IEnumerable<string> FindCommonWords(string str1, string str2, StringComparer comparer = null)
{
    if (str1 == null)
        throw new ArgumentNullException("str1", "Both input strings must not be null!");
    if (str2 == null)
        throw new ArgumentNullException("str2", "Both input strings must not be null!");
    if (comparer == null) comparer = StringComparer.CurrentCulture;

    string[] words1 = str1.Trim().Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);
    string[] words2 = str2.Trim().Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);

    // A common sequence can never be longer than the shorter of the two sentences.
    int longestPossible = Math.Min(words1.Length, words2.Length);
    if (longestPossible < 2)
        return Enumerable.Empty<string>(); // one word is not supposed to be a common word sequence

    // Try the longest candidates first so the first hit is the longest common sequence.
    // Starting at longestPossible (not words1.Length - 1) also covers the case where
    // ALL words of str1 appear in str2, e.g. "Stuff and Things" vs "Stuff and Things corp".
    for (int wordCount = longestPossible; wordCount >= 2; wordCount--)
    {
        // Slide a window of wordCount words over words1 from left to right.
        for (int skipCount = 0; wordCount + skipCount <= words1.Length; skipCount++)
        {
            // Materialize once: the candidate is enumerated by the search and then returned.
            string[] wordSeq = words1.Skip(skipCount).Take(wordCount).ToArray();

            if (words2.IndexOfSequence(wordSeq, comparer) >= 0)
            {
                return wordSeq;
            }
        }
    }

    return Enumerable.Empty<string>();
}
这是一个扩展方法,它对其他需求可能也有用:
/// <summary>
/// Searches <paramref name="input"/> for the first position at which all elements of
/// <paramref name="sequence"/> occur consecutively and in the same order.
/// </summary>
/// <typeparam name="TSource">Element type of both sequences.</typeparam>
/// <param name="input">The sequence to search in.</param>
/// <param name="sequence">The non-empty run of elements to look for.</param>
/// <param name="comparer">
/// Comparer used to match elements. Defaults to
/// <see cref="EqualityComparer{T}.Default"/> when <c>null</c>.
/// </param>
/// <returns>
/// The zero-based index in <paramref name="input"/> where the first occurrence of
/// <paramref name="sequence"/> starts, or -1 when there is no occurrence.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="input"/> or <paramref name="sequence"/> is <c>null</c>.
/// </exception>
/// <exception cref="ArgumentException"><paramref name="sequence"/> is empty.</exception>
public static int IndexOfSequence<TSource>(this IEnumerable<TSource> input, IEnumerable<TSource> sequence, IEqualityComparer<TSource> comparer)
{
    if (input == null) throw new ArgumentNullException("input");
    if (sequence == null) throw new ArgumentNullException("sequence");

    // Materialize both sides once so every start position can be re-checked from scratch.
    TSource[] pattern = sequence.ToArray();
    if (pattern.Length == 0) throw new ArgumentException("Sequence must not be empty", "sequence");
    if (comparer == null)
    {
        comparer = EqualityComparer<TSource>.Default;
    }
    TSource[] haystack = input.ToArray();

    // Check every window of pattern.Length elements. A failed partial match simply moves
    // on to the next start position, so occurrences that begin inside or after an
    // earlier partial match (e.g. [a, b] in [a, a, b]) are still found.
    for (int start = 0; start + pattern.Length <= haystack.Length; start++)
    {
        int matched = 0;
        while (matched < pattern.Length && comparer.Equals(haystack[start + matched], pattern[matched]))
        {
            matched++;
        }

        if (matched == pattern.Length)
        {
            return start;
        }
    }

    return -1;
}
以下是您的三个样本:
// Demo: each entry holds the two sentences to compare; the trailing comment
// shows the line that is printed for that pair.
string[][] samplePairs =
{
    new[] { "Stuff and Things corporation", "101, Stuff and Things corporat" }, // Stuff and Things
    new[] { "101, Stuff and Things corporat", "or Stuff and Things." },         // Stuff and Things
    new[] { "New York city", "York New city" }                                  // empty sequence, no match
};

foreach (string[] pair in samplePairs)
{
    // Case-insensitive comparison, then print the shared words separated by spaces.
    var sharedWords = FindCommonWords(pair[0], pair[1], StringComparer.CurrentCultureIgnoreCase);
    Console.WriteLine(string.Join(" ", sharedWords));
}
请注意,它是从头开始编写的,未经过彻底测试。