C#中的Jaro-Winkler距离算法

时间:2013-10-01 18:39:55

标签: string comparison distance jaro-winkler

如何在C#中实现Jaro-Winkler距离串比较算法?

4 个答案:

答案 0 :(得分:34)

public static class JaroWinklerDistance
{
    /* The Winkler modification will not be applied unless the 
     * percent match was at or above the mWeightThreshold percent 
     * without the modification. 
     * Winkler's paper used a default value of 0.7
     */
    private static readonly double mWeightThreshold = 0.7;

    /* Size of the prefix to be concidered by the Winkler modification. 
     * Winkler's paper used a default value of 4
     */
    private static readonly int mNumChars = 4;


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (perfect match) to 1 (no match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double distance(string aString1, string aString2) {
        return 1.0 - proximity(aString1,aString2);
    }


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (no match) to 1 (perfect match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double proximity(string aString1, string aString2)
    {
        int lLen1 = aString1.Length;
        int lLen2 = aString2.Length;
        if (lLen1 == 0)
            return lLen2 == 0 ? 1.0 : 0.0;

        int  lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);

        // default initialized to false
        bool[] lMatched1 = new bool[lLen1];
        bool[] lMatched2 = new bool[lLen2];

        int lNumCommon = 0;
        for (int i = 0; i < lLen1; ++i) {
            int lStart = Math.Max(0,i-lSearchRange);
            int lEnd = Math.Min(i+lSearchRange+1,lLen2);
            for (int j = lStart; j < lEnd; ++j) {
                if (lMatched2[j]) continue;
                if (aString1[i] != aString2[j])
                    continue;
                lMatched1[i] = true;
                lMatched2[j] = true;
                ++lNumCommon;
                break;
            }
        }
        if (lNumCommon == 0) return 0.0;

        int lNumHalfTransposed = 0;
        int k = 0;
        for (int i = 0; i < lLen1; ++i) {
            if (!lMatched1[i]) continue;
            while (!lMatched2[k]) ++k;
            if (aString1[i] != aString2[k])
                ++lNumHalfTransposed;
            ++k;
        }
        // System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
        int lNumTransposed = lNumHalfTransposed/2;

        // System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
        double lNumCommonD = lNumCommon;
        double lWeight = (lNumCommonD/lLen1
                         + lNumCommonD/lLen2
                         + (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;

        if (lWeight <= mWeightThreshold) return lWeight;
        int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
        int lPos = 0;
        while (lPos < lMax && aString1[lPos] == aString2[lPos])
            ++lPos;
        if (lPos == 0) return lWeight;
        return lWeight + 0.1 * lPos * (1.0 - lWeight);

    }


}

答案 1 :(得分:1)

你可以看看Lucene.Net,

它实现了Jaro-Winkler距离算法,

并且其分数与leebickmtu帖子不同,

你可以把它作为参考

网址如下:

http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html

答案 2 :(得分:0)

使用以下类来使用jaro winkler。 我已经定制了算法jaro和jaro-winkler。

访问Github获取DLL。

using System;
using System.Linq;

namespace Search
{
    public static class EditDistance
    {
        private struct JaroMetrics
        {
            public int Matches;

            public int Transpositions;
        }

        private static EditDistance.JaroMetrics Matches(string s1, string s2)
        {
            string text;
            string text2;
            if (s1.Length > s2.Length)
            {
                text = s1;
                text2 = s2;
            }
            else
            {
                text = s2;
                text2 = s1;
            }
            int num = Math.Max(text.Length / 2 - 1, 0);
            int[] array = new int[text2.Length];
            int i;
            for (i = 0; i < array.Length; i++)
            {
                array[i] = -1;
            }
            bool[] array2 = new bool[text.Length];
            int num2 = 0;
            for (int j = 0; j < text2.Length; j++)
            {
                char c = text2[j];
                int k = Math.Max(j - num, 0);
                int num3 = Math.Min(j + num + 1, text.Length);
                while (k < num3)
                {
                    if (!array2[k] && c == text[k])
                    {
                        array[j] = k;
                        array2[k] = true;
                        num2++;
                        break;
                    }
                    k++;
                }
            }
            char[] array3 = new char[num2];
            char[] ms2 = new char[num2];
            i = 0;
            int num4 = 0;
            while (i < text2.Length)
            {
                if (array[i] != -1)
                {
                    array3[num4] = text2[i];
                    num4++;
                }
                i++;
            }
            i = 0;
            num4 = 0;
            while (i < text.Length)
            {
                if (array2[i])
                {
                    ms2[num4] = text[i];
                    num4++;
                }
                i++;
            }
            int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
            EditDistance.JaroMetrics result;
            result.Matches = num2;
            result.Transpositions = num5 / 2;
            return result;
        }



        public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
        {
            prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
            prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
            float num = s1.Jaro(s2);
            int num2 = 0;
            for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
            {
                if (s1[i] != s2[i])
                {
                    break;
                }
                num2++;
            }
            return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
        }

        public static float JaroWinkler(this string s1, string s2, float prefixScale)
        {
            return s1.JaroWinkler(s2, prefixScale, 0.7f);
        }

        public static float JaroWinkler(this string s1, string s2)
        {
            return s1.JaroWinkler(s2, 0.1f, 0.7f);
        }

        public static float Jaro(this string s1, string s2)
        {
            EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
            float num = (float)jaroMetrics.Matches;
            int transpositions = jaroMetrics.Transpositions;
            float result;
            if (num == 0f)
            {
                result = 0f;
            }
            else
            {
                float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
                result = num2;
            }
            return result;
        }

        public static int LevenshteinDistance(this string source, string target)
        {
            int result;
            if (string.IsNullOrEmpty(source))
            {
                if (string.IsNullOrEmpty(target))
                {
                    result = 0;
                }
                else
                {
                    result = target.Length;
                }
            }
            else if (string.IsNullOrEmpty(target))
            {
                result = source.Length;
            }
            else
            {
                if (source.Length > target.Length)
                {
                    string text = target;
                    target = source;
                    source = text;
                }
                int length = target.Length;
                int length2 = source.Length;
                int[,] array = new int[2, length + 1];
                for (int i = 1; i <= length; i++)
                {
                    array[0, i] = i;
                }
                int num = 0;
                for (int j = 1; j <= length2; j++)
                {
                    num = (j & 1);
                    array[num, 0] = j;
                    int num2 = num ^ 1;
                    for (int i = 1; i <= length; i++)
                    {
                        int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
                        array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
                    }
                }
                result = array[num, length];
            }
            return result;
        }
    }
}

答案 3 :(得分:0)

您可以使用以下代码对所有类型的字符串都非常有效。获得结果后,您需要乘以100以获得相似百分比。我希望它能解决您的问题。

    public class JaroWinkler
{
    private const double defaultMismatchScore = 0.0;
    private const double defaultMatchScore = 1.0;

    /// <summary>
    /// Gets the similarity between two strings by using the Jaro-Winkler algorithm.
    /// A value of 1 means perfect match. A value of zero represents an absolute no match
    /// </summary>
    /// <param name="_firstWord"></param>
    /// <param name="_secondWord"></param>
    /// <returns>a value between 0-1 of the similarity</returns>
    /// 
    public static double RateSimilarity(string _firstWord, string _secondWord)
    {
        // Converting to lower case is not part of the original Jaro-Winkler implementation
        // But we don't really care about case sensitivity in DIAMOND and wouldn't decrease security names similarity rate just because
        // of Case sensitivity
        _firstWord = _firstWord.ToLower();
        _secondWord = _secondWord.ToLower();

        if ((_firstWord != null) && (_secondWord != null))
        {
            if (_firstWord == _secondWord)
                //return (SqlDouble)defaultMatchScore;
                return defaultMatchScore;
            else
            {
                // Get half the length of the string rounded up - (this is the distance used for acceptable transpositions)
                int halfLength = Math.Min(_firstWord.Length, _secondWord.Length) / 2 + 1;

                // Get common characters
                StringBuilder common1 = GetCommonCharacters(_firstWord, _secondWord, halfLength);
                int commonMatches = common1.Length;

                // Check for zero in common
                if (commonMatches == 0)
                    //return (SqlDouble)defaultMismatchScore;
                    return defaultMismatchScore;

                StringBuilder common2 = GetCommonCharacters(_secondWord, _firstWord, halfLength);

                // Check for same length common strings returning 0 if is not the same
                if (commonMatches != common2.Length)
                    //return (SqlDouble)defaultMismatchScore;
                    return defaultMismatchScore;

                // Get the number of transpositions
                int transpositions = 0;
                for (int i = 0; i < commonMatches; i++)
                {
                    if (common1[i] != common2[i])
                        transpositions++;
                }

                int j = 0;
                j += 1;

                // Calculate Jaro metric
                transpositions /= 2;
                double jaroMetric = commonMatches / (3.0 * _firstWord.Length) + commonMatches / (3.0 * _secondWord.Length) + (commonMatches - transpositions) / (3.0 * commonMatches);
                //return (SqlDouble)jaroMetric;
                return jaroMetric;
            }
        }

        //return (SqlDouble)defaultMismatchScore;
        return defaultMismatchScore;
    }

    /// <summary>
    /// Returns a string buffer of characters from string1 within string2 if they are of a given
    /// distance seperation from the position in string1.
    /// </summary>
    /// <param name="firstWord">string one</param>
    /// <param name="secondWord">string two</param>
    /// <param name="separationDistance">separation distance</param>
    /// <returns>A string buffer of characters from string1 within string2 if they are of a given
    /// distance seperation from the position in string1</returns>
    private static StringBuilder GetCommonCharacters(string firstWord, string secondWord, int separationDistance)
    {
        if ((firstWord != null) && (secondWord != null))
        {
            StringBuilder returnCommons = new StringBuilder(20);
            StringBuilder copy = new StringBuilder(secondWord);
            int firstWordLength = firstWord.Length;
            int secondWordLength = secondWord.Length;

            for (int i = 0; i < firstWordLength; i++)
            {
                char character = firstWord[i];
                bool found = false;

                for (int j = Math.Max(0, i - separationDistance); !found && j < Math.Min(i + separationDistance, secondWordLength); j++)
                {
                    if (copy[j] == character)
                    {
                        found = true;
                        returnCommons.Append(character);
                        copy[j] = '#';
                    }
                }
            }
            return returnCommons;
        }
        return null;
    }
}