DotNet Soundex功能

时间:2012-06-20 14:35:22

标签: .net soundex

我有一个数据库表,其中有一列SQLServer Soundex编码的姓氏+名字。 在我的C#程序中,我想使用soundex转换一个字符串,以便在我的查询中使用。

在dotnet库中是否存在soundex的标准字符串函数,或者是实现它的开源库(可能作为字符串的扩展方法)?

4 个答案:

答案 0 :(得分:5)

我知道现在回答已经有些晚了,但我也需要类似的功能(虽然不涉及数据库),而当时唯一的答案并不准确(对 'Tymczak' 和 'Pfister' 会给出错误的结果)。

这就是我提出的:

/// <summary>
/// Smoke test for <c>Soundex.Generate</c>: checks a set of well-known names
/// plus inputs that contain no letters at all.
/// </summary>
class Program
{
    /// <summary>
    /// Runs every check in sequence. Each assertion passes the expected code
    /// first and the computed code second, so that a failure report labels the
    /// two values correctly.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main(string[] args)
    {
        // Single letter is padded with zeros.
        Assert.AreEqual("H000", Soundex.Generate("H"));

        // Names that are expected to share a code.
        Assert.AreEqual("R163", Soundex.Generate("Robert"));
        Assert.AreEqual("R163", Soundex.Generate("Rupert"));
        Assert.AreEqual("R150", Soundex.Generate("Rubin"));

        // Same-class consonants separated by H collapse into one digit.
        Assert.AreEqual("A261", Soundex.Generate("Ashcraft"));
        Assert.AreEqual("A261", Soundex.Generate("Ashcroft"));

        // Same-class consonants separated by a vowel are both kept.
        Assert.AreEqual("T522", Soundex.Generate("Tymczak"));

        // First letter shares a class with the second letter.
        Assert.AreEqual("P236", Soundex.Generate("Pfister"));

        Assert.AreEqual("G362", Soundex.Generate("Gutierrez"));
        Assert.AreEqual("J250", Soundex.Generate("Jackson"));
        Assert.AreEqual("V532", Soundex.Generate("VanDeusen"));
        Assert.AreEqual("D250", Soundex.Generate("Deusen"));

        // W is dropped without separating anything.
        Assert.AreEqual("S630", Soundex.Generate("Sword"));
        Assert.AreEqual("S630", Soundex.Generate("Sord"));

        // Punctuation is ignored.
        Assert.AreEqual("L230", Soundex.Generate("Log-out"));
        Assert.AreEqual("L230", Soundex.Generate("Logout"));

        // Inputs without any letters produce the empty code.
        Assert.AreEqual(Soundex.Empty, Soundex.Generate("123"));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(""));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(null));
    }
}

/// <summary>
/// Generates four-character Soundex codes (one letter followed by three digits).
/// Letters of the same sound class that are adjacent, or separated only by
/// H or W, are encoded once; vowels separate repeated classes.
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no letters A-Z.</summary>
    public const string Empty = "0000";

    // Matches every character that is not an upper-case ASCII letter.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // First alternative: a digit followed by any number of repeats of that same
    // digit, each optionally preceded by H/W (e.g. "2H2W2"); replaced by the digit.
    // Second alternative: any remaining run of H/W; group 1 is unset there, so
    // the "$1" replacement removes the run.
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"(\d)(?:[WH]*\1)*|[WH]+", RegexOptions.Compiled);

    // Matches the letters that act only as separators and carry no digit.
    private static readonly Regex RemoveVowelSounds = new Regex(@"[AEIOUY]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code of <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">Text to encode; may be null. Characters other than A-Z (after upper-casing) are ignored.</param>
    /// <returns>
    /// The first letter followed by three digits, zero-padded, or
    /// <see cref="Empty"/> when the input has no letters A-Z.
    /// </returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with invariant rules so that the result does not depend on
        // the current culture (e.g. Turkish 'i' -> 'İ' would be stripped below),
        // then remove non-alphas.
        Phrase = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing to soundex, return empty
        if (string.IsNullOrEmpty(Phrase))
            return Empty;

        // Convert consonants to their numerical representation; vowels, H and W stay as letters.
        var Numified = Numify(Phrase);

        // Collapse repeated digits (same sound class), even across H or W, and drop H/W.
        Numified = CollapseRepeatedNumbers.Replace(Numified, @"$1");

        // The first letter is emitted verbatim, so its own encoding must not be
        // repeated. For a consonant this removes the leading digit (which already
        // absorbed any adjacent same-class letters); for a vowel it removes the
        // vowel itself. A leading H/W was already dropped above, so nothing matches.
        if (Numified.Length > 0 && Numified[0] == Numify(Phrase[0]))
        {
            Numified = Numified.Substring(1);
        }

        // Vowels have done their job as separators; remove them.
        Numified = RemoveVowelSounds.Replace(Numified, string.Empty);

        // Concatenate, pad and trim to ensure X### format.
        return (Phrase[0] + Numified).PadRight(4, '0').Substring(0, 4);
    }

    /// <summary>Applies <see cref="Numify(char)"/> to every character of <paramref name="Phrase"/>.</summary>
    private static string Numify(string Phrase)
    {
        return new string(Phrase.Select(c => Numify(c)).ToArray());
    }

    /// <summary>
    /// Maps an upper-case consonant to its Soundex digit; any other character
    /// (vowels, H, W, Y) is returned unchanged.
    /// </summary>
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}

答案 1 :(得分:0)

根据 Wikipedia 中描述的算法:
    /// <summary>
    /// Computes a four-character Soundex code (first character plus three digits).
    /// </summary>
    /// <param name="word">Text to encode; null or empty yields "0000".</param>
    /// <returns>
    /// The upper-cased first character followed by up to three digits,
    /// right-padded with '0' to four characters.
    /// </returns>
    private string Soundex(string word)
    {
        if (string.IsNullOrEmpty(word))
            return "0000";

        // Invariant casing keeps the result independent of the current culture.
        word = word.ToUpperInvariant();
        char initial = word[0];

        // After the first character, H, W and anything that is not A-Z are
        // dropped up front so that same-class consonants on either side of
        // them become adjacent and collapse together below.
        string coded = initial + Regex.Replace(word.Substring(1), "[HW]|[^A-Z]", "");

        // Replace each run of same-class consonants with one digit. Vowels are
        // still present at this point, so they keep separate runs apart. The
        // first character takes part as well, which lets it absorb following
        // letters of its own class (e.g. the "PF" in "PFISTER").
        string[] soundClasses = { "[BFPV]+", "[CGJKQSXZ]+", "[DT]+", "[L]+", "[MN]+", "[R]+" };
        for (int i = 0; i < soundClasses.Length; i++)
        {
            char digit = (char)('1' + i);
            coded = Regex.Replace(coded, soundClasses[i], digit.ToString());
        }

        // Position 0 now holds either the digit standing for the first letter's
        // run or the first character itself; either way it is replaced by the
        // literal first character. Then the vowels can be removed.
        coded = Regex.Replace(coded.Substring(1), "[AEIOUY]", "");

        return (initial + coded).PadRight(4, '0').Substring(0, 4);
    }

答案 2 :(得分:0)

你可以在 C# 中使用类似下面的代码(参照 SQL 的 SOUNDEX 实现):

/// <summary>
/// Computes a four-character Soundex code: the first character of
/// <paramref name="data"/> followed by up to three digits, padded with '0'.
/// </summary>
/// <param name="data">Text to encode; null or empty yields "0000".</param>
/// <returns>The upper-cased four-character code.</returns>
public static string Soundex(string data)
{
    // Parallel lookup tables: Codes[i] is the digit for Consonants[i].
    const string Consonants = "bfpvcgjkqsxzdtlmnr";
    const string Codes = "111122222222334556";
    const string Vowels = "aeiouy";

    StringBuilder result = new StringBuilder();

    if (!string.IsNullOrEmpty(data))
    {
        // The first character is kept verbatim.
        result.Append(data[0]);

        // '\0' means "no code to compare against".
        char previousCode = '\0';

        // Start at 0 so that the first letter's code is remembered and a
        // following letter of the same class (the "f" in "Pf...") is skipped.
        for (int i = 0; i < data.Length; i++)
        {
            // Invariant casing keeps the lookup independent of the current culture.
            char letter = char.ToLowerInvariant(data[i]);
            int index = Consonants.IndexOf(letter);

            if (index >= 0)
            {
                char currentCode = Codes[index];

                // The first letter itself is never emitted as a digit.
                if (i > 0 && currentCode != previousCode)
                {
                    result.Append(currentCode);
                    if (result.Length == 4) break;
                }

                previousCode = currentCode;
            }
            else if (Vowels.IndexOf(letter) >= 0)
            {
                // A vowel separates consonants: the same code may follow again.
                previousCode = '\0';
            }
            // Anything else (h, w, digits, punctuation) is skipped without
            // touching previousCode, so same-class consonants around it merge.
        }
    }

    if (result.Length < 4)
        result.Append('0', 4 - result.Length);

    return result.ToString().ToUpperInvariant();
}

答案 3 :(得分:0)

基于 Dotnet Services 和 tigrou 的答案,我更正了该算法,以反映 Wikipedia 中描述的功能。

Ashcraft = A226,Tymczak = T522,Pfister = P236和Honeyman = H555等测试用例现在可以正常工作。

/// <summary>
/// Builds a four-character code from <paramref name="data"/>: its first
/// character followed by up to three sound-class digits, padded with '0'.
/// Every character without a digit (vowels, h, w, anything else) clears the
/// remembered code, so an equal code on either side of it is emitted twice.
/// </summary>
/// <param name="data">Text to encode; null or empty yields "0000".</param>
/// <returns>The upper-cased four-character code.</returns>
public static string Soundex(string data)
{
    const int CodeLength = 4;
    StringBuilder encoded = new StringBuilder();

    if (!string.IsNullOrEmpty(data))
    {
        // The leading character is always kept as-is.
        encoded.Append(data[0]);

        string lastDigit = "";

        // Position 0 is scanned too, purely to remember the first letter's
        // digit so that a following letter of the same class is suppressed.
        for (int pos = 0; pos < data.Length && encoded.Length < CodeLength; pos++)
        {
            string digit;
            switch (data[pos].ToString().ToLower())
            {
                case "b": case "f": case "p": case "v":
                    digit = "1";
                    break;
                case "c": case "g": case "j": case "k":
                case "q": case "s": case "x": case "z":
                    digit = "2";
                    break;
                case "d": case "t":
                    digit = "3";
                    break;
                case "l":
                    digit = "4";
                    break;
                case "m": case "n":
                    digit = "5";
                    break;
                case "r":
                    digit = "6";
                    break;
                default:
                    digit = "";
                    break;
            }

            // Emit only real digits that differ from the one just seen, and
            // never for the first character.
            bool isNewDigit = digit.Length > 0 && digit != lastDigit;
            if (pos > 0 && isNewDigit)
            {
                encoded.Append(digit);
            }

            // Remember the digit unconditionally, including the empty one.
            lastDigit = digit;
        }
    }

    int missing = CodeLength - encoded.Length;
    if (missing > 0)
    {
        encoded.Append('0', missing);
    }

    return encoded.ToString().ToUpper();
}