我正在使用此方法从字符串中删除重音:
/// <summary>
/// Strips combining accent marks from <paramref name="input"/>: the text is
/// decomposed with compatibility decomposition (NFKD) and every character whose
/// Unicode category is NonSpacingMark is dropped.
/// </summary>
/// <param name="input">Text to process; must not be null.</param>
/// <returns>The decomposed text without non-spacing marks.</returns>
/// <remarks>
/// Letters that have no Unicode decomposition mapping (for example U+0111 "đ")
/// are returned unchanged.
/// </remarks>
static string RemoveAccents(string input)
{
    string decomposed = input.Normalize(NormalizationForm.FormKD);
    StringBuilder result = new StringBuilder();

    for (int index = 0; index < decomposed.Length; index++)
    {
        char current = decomposed[index];
        bool isCombiningMark = char.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark;
        if (isCombiningMark)
        {
            continue;
        }

        result.Append(current);
    }

    return result.ToString();
}
但是这个方法将đ留作đ并且不会将其更改为d,即使d是其基本字符。 你可以用这个输入字符串“æøåáâăäĺćçčéęěěîďđńňóôőöřůúűüýţ”尝试它。
字母đ中有什么特别之处?
答案 0 :(得分:13)
之所以不起作用,是因为“d是其基本字符”这一说法并不成立。U+0111(LATIN SMALL LETTER D WITH STROKE)的Unicode类别为“Letter, Lowercase”,并且没有分解映射(即,它不会分解为“d”后跟一个组合标记)。
"đ".Normalize(NormalizationForm.FormD)
只返回"đ"
,它不会被循环删除,因为它不是非间距标记。
“ø”以及其他Unicode未提供分解映射的字母也存在类似的问题。(如果你是想为Unicode字母找到“最佳”的ASCII对应字符,那么这种方法对西里尔文、希腊文、中文或其他非拉丁文字根本不适用;如果你想把“ß”音译成“ss”,同样会遇到问题。使用UnidecodeSharp这样的库可能会有所帮助。)
答案 1 :(得分:3)
我必须承认,我不确定这为什么有效,但它确实似乎可行:
var str = "æøåáâăäĺćçčéęëěíîďđńňóôőöřůúűüýţ";
// Encode the text with the "Cyrillic" code page, then read those bytes back as ASCII.
// NOTE(review): presumably this relies on the code page substituting plain Latin
// look-alikes for letters it cannot represent - confirm on the target runtime.
var cyrillicEncoding = Encoding.GetEncoding("Cyrillic");
var cyrillicBytes = cyrillicEncoding.GetBytes(str);
var noApostrophes = Encoding.ASCII.GetString(cyrillicBytes);
=> “aoaaaaalccceeeeiiddnnooooruuuuyt”
答案 2 :(得分:3)
“D with stroke”(维基百科)以多种语言使用,并且在所有语言中看起来都被视为一个独特的字母 - 这就是它保持不变的原因。
答案 3 :(得分:0)
string.Normalize(NormalizationForm)
是删除“真正的”变音符号(Wiki)的简便方法,但是您可能想要转换的许多字母并不受它影响。
我对Ð和ð(字母Eth),đ,Æ和æ有类似的问题。要将它们转换为ANSI(拉丁语),请改用Unicode转换!
/// <summary>
/// Round-trips <paramref name="input"/> through another encoding: the text is
/// encoded as UTF-16, converted to the encoding identified by
/// <paramref name="resultEncodingCode"/>, and decoded back into characters.
/// </summary>
/// <param name="input">Text to convert.</param>
/// <param name="resultEncodingCode">Code page identifier passed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The characters decoded from the converted bytes.</returns>
private static char[] ConvertUnicodeStringToSpecificEncoding(string input, int resultEncodingCode)
{
    System.Text.Encoding source = System.Text.Encoding.Unicode;
    System.Text.Encoding target = System.Text.Encoding.GetEncoding(resultEncodingCode);

    byte[] utf16Bytes = source.GetBytes(input);
    byte[] targetBytes = System.Text.Encoding.Convert(source, target, utf16Bytes);

    // GetChars(byte[]) returns a new array holding exactly the decoded characters.
    return target.GetChars(targetBytes);
}
在同一字符串上使用多种编码调用此方法,以在要保留的字母上创建一个交集。
编码列表: https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding?view=netframework-4.8
我的解决方案如下
// Code page identifiers (int codes): https://docs.microsoft.com/en-us/dotnet/api/system.text.encoding?view=netframework-4.8

/// <summary>Umlaut letters that RemoveDiacritics copies to the output unchanged.</summary>
private static readonly char[] charactersToSkip = { 'ä', 'ö', 'ü', 'Ä', 'Ö', 'Ü' };

/// <summary>Punctuation and symbol characters that RemoveDiacritics copies to the output unchanged.</summary>
private static readonly char[] specialCharsToSkip =
{
    '^', '´', '`', '°', '!', '\'', '§', '$', '%', '&', '/',
    '(', ')', '=', '{', '[', ']', '}', '\\', '+', '-'
};

/// <summary>
/// Characters that could also be produced by an encoding conversion and therefore
/// have to be skipped before any conversion is attempted.
/// </summary>
private static readonly char[] ambiguousCharsToSkip = { '?' };

/// <summary>
/// Code pages tried, in this order, when converting a letter.
/// Warning: only append entries - changing the order may break the conversion.
/// </summary>
private static readonly int[] encodingsToRemoveDiacritics =
{
    852, // ibm852  Central European (DOS)
    850, // ibm850  Western European (DOS)
    860, // IBM860  Portuguese (DOS)
};
/// <summary>
/// Replaces accented and other special letters in <paramref name="inputString"/>
/// with plain look-alike characters.
/// </summary>
/// <remarks>
/// Each character is handled on its own:
/// characters listed in <c>charactersToSkip</c>, <c>specialCharsToSkip</c> or
/// <c>ambiguousCharsToSkip</c> are copied unchanged; every other character is
/// decomposed (NFD), its non-spacing marks are dropped, and the remaining characters
/// are passed through each code page in <c>encodingsToRemoveDiacritics</c>. A
/// conversion result is kept only when it contains no '?'.
/// UTF-16 surrogate code units are copied unchanged, because normalizing half of a
/// surrogate pair would throw.
/// </remarks>
/// <param name="inputString">Text to process; may be null or empty.</param>
/// <returns>
/// The converted text, or <paramref name="inputString"/> itself when it is null or empty.
/// </returns>
public static string RemoveDiacritics(this string inputString)
{
    if (string.IsNullOrEmpty(inputString))
    {
        return inputString;
    }

    var resultStringBuilder = new StringBuilder();
    foreach (char currentChar in inputString)
    {
        // The loop walks UTF-16 code units, so a supplementary-plane character arrives as
        // two separate surrogates. String.Normalize throws ArgumentException for a lone
        // surrogate, so surrogates are passed through untouched.
        if (char.IsSurrogate(currentChar)
            || charactersToSkip.Contains(currentChar)
            || specialCharsToSkip.Contains(currentChar)
            || ambiguousCharsToSkip.Contains(currentChar))
        {
            resultStringBuilder.Append(currentChar);
            continue;
        }

        string normalizedString = currentChar.ToString().Normalize(NormalizationForm.FormD);
        foreach (char normalizedChar in normalizedString)
        {
            // Drop the combining marks produced by the decomposition.
            if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(normalizedChar) == System.Globalization.UnicodeCategory.NonSpacingMark)
            {
                continue;
            }

            string convertedString = normalizedChar.ToString();
            foreach (int encodingCode in encodingsToRemoveDiacritics)
            {
                // Each code page is applied to the result of the previous one; a '?' in the
                // output means the conversion failed, so that result is discarded.
                char[] convertedChars = ConvertUnicodeStringToSpecificEncoding(convertedString, encodingCode);
                if (!convertedChars.Contains('?'))
                {
                    convertedString = new string(convertedChars);
                }
            }

            resultStringBuilder.Append(convertedString);
        }
    }

    return resultStringBuilder.ToString();
}
创建以下输出
"abcdefghijklmnopqrstuvwxzy" -> "abcdefghijklmnopqrstuvwxzy"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" -> "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"1234567890" -> "1234567890"
"ß" -> "ß"
"ÄÖÜ" -> "ÄÖÜ"
"äöü" -> "äöü"
"!\"§$%&/()=?" -> "!\"§$%&/()=?"
"+-_~'*#" -> "+-_~'*#"
",.;:" -> ",.;:"
"µ" -> "u" // My -> u
"<>|" -> "<>|"
"´`^°" -> "´`^°"
"²" -> "2" // ² -> 2
"³" -> "3" // ³ -> 3
"{}" -> "{}"
"[]" -> "[]"
"\\" -> "\\"
"áàâã" -> "aaaa"
"ÁÀÂÅ" -> "AAAA"
"éèêę" -> "eeee"
"ÉÈÊĚ" -> "EEEE"
"íìîï" -> "iiii"
"ÍÌÎ" -> "III"
"óòôõ" -> "oooo"
"ÓÒÔŌ" -> "OOOO"
"úùû" -> "uuu"
"ÚÙÛ" -> "UUU"
"ÇĆĈČĊ" -> "CCCCC"
"çćĉčċ" -> "ccccc"
"Ñ" -> "N"
"Æ" -> "A"
"æ" -> "a"
"ýÿ" -> "yy"
"ĹĻĽ" -> "LLL"
"Ð" -> "D"
"đ" -> "d"
"ð" -> "d"
答案 4 :(得分:-4)
这应该有效
/// <summary>
/// Decomposes <paramref name="text"/> with canonical decomposition (NFD) and returns
/// it without the characters whose Unicode category is NonSpacingMark.
/// </summary>
/// <param name="text">Text to process; must not be null.</param>
/// <returns>The decomposed text with all non-spacing marks removed.</returns>
private static String RemoveDiacritics(string text)
{
    string decomposed = text.Normalize(NormalizationForm.FormD);
    var kept = new StringBuilder();

    foreach (char ch in decomposed)
    {
        bool isNonSpacingMark = CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark;
        if (!isNonSpacingMark)
        {
            kept.Append(ch);
        }
    }

    return kept.ToString();
}