我正在尝试用C ++编写程序,通过将每个大写字母转换为小写来规范化字符串。
此外,我正在处理一些特殊字符,因为我的母语是西班牙语,它也应该与西班牙语一起使用。出于某种原因,我从normalize()
返回了一个字符串,但却无法cout
。
所以,为了使它工作,我必须打印它就好像它是一个数组,并且它适用于大多数情况因为我使用word.length()
。但是,当我切换到result.length()
时,它每次都给我一个直的0
。我无法弄清楚问题是什么,也许我必须为结果添加一个null终止符,以便length()
可以完成它的工作?
#include <iostream>
#include <string>
using namespace std;
string normalize(string word)
{
string result;
int j = 0;
for (int i = 0; i < word.length(); i++)
{
if (word[i] >= 'A' && word[i] <= 'Z')
{
result[j] = tolower(word[i]);
}
else
{
if (word[i] == char(0xC3))
{
switch (word[i + 1])
{
case char(0xA1):
word[j] = 'a';
break;
case char(0xA9):
word[j] = 'e';
break;
case char(0xAD):
word[j] = 'i';
break;
case char(0xB3):
word[j] = 'o';
break;
case char(0xBA):
word[j] = 'u';
break;
case char(0xBC):
word[j] = 'u';
break;
case char(0x81):
word[j] = 'a';
break;
case char(0x89):
word[j] = 'e';
break;
case char(0x8D):
word[j] = 'i';
break;
case char(0x93):
word[j] = 'o';
break;
case char(0x9A):
word[j] = 'u';
break;
case char(0x9C):
word[j] = 'u';
break;
}
i++;
}
else
result[j] = result[i];
}
j++;
}
return result;
}
int main()
{
int counter = 0;
string word;
while (cin >> word)
{
counter++;
string result = normalize(word);
cout << counter << ". ";
for (int i = 0; i < result.length(); i++)
{
cout << result[i];
}
cout << endl;
}
return 0;
}
答案 0 :(得分:1)
normalize()
期望UTF-8字符串作为输入。处理&#34;特殊&#34;字符,你根本没有写任何字符到result
,而是将它们写回word
。即使你把它们写到result
,你也没有正确地写它们,因为在填充之前你还没有为result
分配任何内存。您应该使用operator+=
代替operator[]
,或者至少在进入循环之前调用result.resize(word.length())
,然后在退出循环后调用result.resize(j)
。
尝试更像这样的东西:
string normalize(const string &word)
{
string result;
result.reserve(word.length());
int i = 0;
while (i < word.length())
{
char ch = word[i++];
if (ch <= 0x7F)
{
result += (char) tolower(ch);
}
else if ((ch == 0xC3) && (i < word.length()))
{
ch = word[i++];
switch (ch)
{
case 0x81:
case 0xA1:
result += 'a';
break;
case 0x89:
case 0xA9:
result += 'e';
break;
case 0x8D:
case 0xAD:
result += 'i';
break;
case 0x93:
case 0xB3:
result += 'o';
break;
case 0x9A:
case 0x9C:
case 0xBA:
case 0xBC:
result += 'u';
break;
default:
result += '?';
break;
}
}
else
result += '?';
}
return result;
}
然而,话虽如此,normalize()
正在做的事情并不是处理UTF-8的正确方法。您正在寻找的是&#34;音译&#34;,它比您的简单实现更为复杂。您应该使用ICONV或ICU等专用Unicode库。但是如果你打算手动完成,至少要正确解码和处理UTF-8,例如:
string normalize(const string &word)
{
// TODO: normalize word using Unicode Normalization Form NFC first...
string result;
result.reserve(word.length());
int i = 0;
while (i < word.length())
{
uint8_t ch = (uint8_t) word[i++];
int32_t cp;
int count;
if ((ch & 0x80) == 0x00)
{
cp = (ch & 0x7F);
count = 0;
}
else if ((ch & 0xE0) == 0xC0)
{
cp = ch & 0x1F;
count = 1;
}
else if ((ch & 0xF0) == 0xE0)
{
cp = ch & 0x0F;
count = 2;
}
else if ((ch & 0xF8) == 0xF0)
{
cp = ch & 0x07;
count = 3;
}
else
{
result += '?';
continue;
}
bool ok = ((i+count) <= word.length());
for (int j = 0; (ok) && (j < count); ++j)
{
ch = (uint8_t) word[i++];
if ((ch & 0xC0) != 0x80)
{
ok = false;
break;
}
cp <<= 6;
cp |= (ch & 0x3F);
}
if (!ok)
{
result += '?';
}
else
{
switch (cp)
{
case 0x00C1:
case 0x00E1:
result += 'a';
break;
case 0x00C9:
case 0x00E9:
result += 'e';
break;
case 0x00CD:
case 0x00ED:
result += 'i';
break;
case 0x00D3:
case 0x00F3:
result += 'o';
break;
case 0x00DA:
case 0x00DC:
case 0x00FA:
case 0x00FC:
result += 'u';
break;
default:
if (cp <= 0x007F)
result += (char) tolower(cp);
else
result += '?';
break;
}
}
}
return result;
}
或者,如果您使用的是C ++ 11或更高版本:
string normalize(const string &word)
{
u32string u32 = codecvt_utf8<char32_t>{}.from_bytes(word);
// TODO: normalize u32 using Unicode Normalization Form NFC first...
string result;
result.reserve(u32.length());
for (char32_t cp : u32)
{
switch (cp)
{
case 0x00C1:
case 0x00E1:
result += 'a';
break;
case 0x00C9:
case 0x00E9:
result += 'e';
break;
case 0x00CD:
case 0x00ED:
result += 'i';
break;
case 0x00D3:
case 0x00F3:
result += 'o';
break;
case 0x00DA:
case 0x00DC:
case 0x00FA:
case 0x00FC:
result += 'u';
break;
default:
if (cp <= 0x007F)
result += (char) tolower(cp);
else
result += '?';
break;
}
}
return result;
}