我一直在尝试从 UTF-8 文件中提取 int,但始终没有成功:
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;
// Reads "UTF8.txt": parses the first line as an int, then echoes that value
// and the raw second line to stdout.
// Note: a UTF-8 byte-order mark at the start of the file is not stripped
// here, so the first line of a BOM-prefixed file does not parse as a number.
int main()
{
    std::ifstream file("UTF8.txt");
    if (!file.is_open())
    {
        // Report the failure instead of exiting silently with no output.
        std::cerr << "Error opening file" << std::endl;
        return 1;
    }

    std::string line;
    std::getline(file, line);

    std::istringstream ss(line);
    // Initialised so a defined value is printed even when the line is empty
    // and the extraction never stores anything into `a`.
    int a = 0;
    ss >> a;
    if (ss.fail())
    {
        std::cout << "Error parsing" << std::endl;
        ss.clear();
    }

    std::getline(file, line);
    std::cout << a << std::endl << line << std::endl;
    file.close();
    return 0;
}
该文件包含 2 行:“42”和“è_é”,用记事本保存为 UTF-8。当文件以 ANSI 编码保存时,上述代码可以正常工作;但以 Unicode(UTF-8)编码保存时就会失败。我已经尝试了很多方法,其中最有希望的是设置语言环境(locale),但我希望程序不依赖于计算机的语言环境(即使是美国区域设置的 PC,也要能读取中文字符)。老实说,我现在已经没有思路了。如果可能的话,我想避免使用 Qt 中的 CStrings。
以下代码会输出“0”和“Error parsing”,因为文件最开头有一个奇怪的字符。如果在数字之前加一个空行,并在读取时将其丢弃,程序就能正常工作,但在最终的程序中我无法修改文件。带重音符号的字符在控制台中无法正确显示,但当我把输出写入文件时一切正常,而这正是我所需要的。所以问题只出在文件的开头!
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Reads "UTF8.srt": parses the first line as an int, then echoes that value
// and the raw second line to stdout.
// NOTE(review): the stream is a narrow std::ifstream, while the imbued facet
// is std::codecvt_utf8<wchar_t, ...>. A char-based file buffer converts
// through std::codecvt<char, char, std::mbstate_t>, so this facet (and its
// std::consume_header flag) is presumably never consulted and a leading BOM
// still reaches the parser -- confirm before relying on it.
int main()
{
    std::ifstream file("UTF8.srt");
    // The locale takes ownership of the facet, hence the bare `new`.
    file.imbue(std::locale(file.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>));
    if (!file.is_open()) {
        // Report the failure instead of exiting silently with no output.
        std::cerr << "Error opening file" << std::endl;
        return 1;
    }

    std::string line;
    std::getline(file, line);

    std::istringstream ss{line};
    // Initialised so a defined value is printed even when the line is empty
    // and the extraction never stores anything into `a`.
    int a = 0;
    ss >> a;
    if (ss.fail()) {
        std::cout << "Error parsing" << std::endl;
        ss.clear();
    }

    std::getline(file, line);
    std::cout << a << std::endl << line << std::endl;
    file.close();
    return 0;
}
以下代码可以正常工作,输入文件内容如下(从代码来看,“bla bla é_è”一行之后应当还有一个空行):
5
bla bla é_è
6
truc è_é
代码:
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Returns `s` advanced past a leading UTF-8 byte-order mark (bytes EF BB BF)
// when the first `size` bytes start with one; otherwise returns `s` unchanged.
// `size` is the number of readable bytes at `s`; fewer than three bytes can
// never hold a BOM, so nothing is read in that case.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    if (size < 3)
        return s;

    // Compare as unsigned bytes so the check does not depend on char's sign.
    const unsigned char first = static_cast<unsigned char>(s[0]);
    const unsigned char second = static_cast<unsigned char>(s[1]);
    const unsigned char third = static_cast<unsigned char>(s[2]);

    if (first == 0xEF && second == 0xBB && third == 0xBF)
        return s + 3;
    return s;
}
// Parses the integer at the start of `line` (after skipping a leading UTF-8
// BOM, if any), prints it to stdout prefixed with `label`, and writes it on
// its own line to `out`. Prints "Error parsing" when the extraction fails;
// the value is still printed and written afterwards.
static void copy_number(const std::string& line, const char* label, std::ostream& out)
{
    std::istringstream input(skip_utf8_bom(line.c_str(), line.size()));
    int a = -1;
    input >> a;
    if (!input) {
        std::cout << "Error parsing" << std::endl;
    }
    std::cout << label << a << std::endl;
    out << a << std::endl;
}

// Copies "UTF8.txt" to "UTF8_copy.txt", expecting the layout:
// number, text line, empty line, number, text line.
// The two numbers are parsed (and echoed to stdout); text lines are copied
// byte-for-byte. Returns 1 when either file cannot be opened, 0 otherwise.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if (!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
        // Signal the failure to the caller instead of reporting success.
        return 1;
    }

    std::string line;

    // Parse the first number
    std::getline(file, line);
    copy_number(line, "Number 1: ", fileO);

    // Copy the following line as is
    std::getline(file, line);
    fileO << line << std::endl;

    // Discard the empty line, but keep an empty line in the output file
    std::getline(file, line);
    fileO << std::endl;

    // Parse the second number (previously mislabelled "Number 1: ")
    std::getline(file, line);
    copy_number(line, "Number 2: ", fileO);

    // Copy the following line as is
    std::getline(file, line);
    fileO << line << std::endl;

    file.close();
    fileO.close();
    return 0;
}
答案 0 :(得分:2)
以上链接中的示例:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
// Writes a small BOM-prefixed UTF-8 file, reads it back through a wide
// stream whose codecvt facet consumes the BOM, and prints every character
// read to stdout with hex/showbase formatting, one per line.
int main()
{
    // Sample data: BOM, 'z', U+6C34, U+1D10B, stored as UTF-8.
    std::ofstream("text.txt") << u8"\ufeffz\u6c34\U0001d10b";

    // Read the UTF-8 file back; std::consume_header skips the BOM.
    std::wifstream fin("text.txt");
    // The locale takes ownership of the facet, hence the bare `new`.
    const std::locale utf8_locale(
        fin.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    fin.imbue(utf8_locale);

    wchar_t ch;
    while (fin.get(ch))
        std::cout << std::hex << std::showbase << ch << '\n';
}
请注意其中的 std::consume_header 设置。
根据您的问题改编:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Reads "UTF8.txt" as raw bytes: strips a leading UTF-8 byte-order mark from
// the first line, parses that line as an int, then echoes the value and the
// unmodified second line to stdout.
int main()
{
    std::ifstream file("UTF8.txt");
    if (!file.is_open()) {
        // Report the failure instead of exiting silently with no output.
        std::cerr << "Error opening file" << std::endl;
        return 1;
    }

    std::string line;
    std::getline(file, line);

    // std::codecvt_utf8 is only specified for wchar_t, char16_t and char32_t,
    // so a char stream cannot portably rely on std::consume_header. Drop the
    // three BOM bytes (EF BB BF) by hand; the rest stays UTF-8 encoded.
    if (line.compare(0, 3, "\xEF\xBB\xBF") == 0) {
        line.erase(0, 3);
    }

    std::istringstream ss{line};
    // Initialised so a defined value is printed even when the line is empty
    // and the extraction never stores anything into `a`.
    int a = 0;
    ss >> a;
    if (ss.fail()) {
        std::cout << "Error parsing" << std::endl;
        ss.clear();
    }

    std::getline(file, line);
    std::cout << a << std::endl << line << std::endl;
    file.close();
    return 0;
}
或者使用 wchar_t 的版本:
#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>
// Reads "UTF8.txt" through a wide stream whose codecvt facet decodes UTF-8
// and consumes a leading BOM: parses the first line as an int, then echoes
// the value and the second line to std::wcout.
int main()
{
    std::wifstream file("UTF8.txt");
    // The locale takes ownership of the facet, hence the bare `new`.
    file.imbue(std::locale(file.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>));
    if (!file.is_open()) {
        // Report the failure instead of exiting silently with no output.
        std::wcerr << L"Error opening file" << std::endl;
        return 1;
    }

    std::wstring line;
    std::getline(file, line);

    std::wistringstream ss{line};
    // Initialised so a defined value is printed even when the line is empty
    // and the extraction never stores anything into `a`.
    int a = 0;
    ss >> a;
    if (ss.fail()) {
        std::wcout << L"Error parsing" << std::endl;
        ss.clear();
    }

    std::getline(file, line);
    std::wcout << a << std::endl << line << std::endl;
    file.close();
    return 0;
}
答案 1 :(得分:2)
只需跳过前导BOM(字节顺序标记):
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
// Do not get used to it:
// using namespace std;
// Skips a UTF-8 byte-order mark: when the first `size` bytes at `s` begin
// with EF BB BF, returns the address just past them; otherwise returns `s`.
// No byte is read unless at least three are available.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    const bool bom_present = size >= 3
        && s[0] == char(0xEF)
        && s[1] == char(0xBB)
        && s[2] == char(0xBF);
    return s + (bom_present ? 3 : 0);
}
// Demonstrates BOM skipping on in-memory data: the simulated file holds a
// UTF-8 BOM, the line "42" and the line "è_é". The first line is parsed as an
// int after the BOM is skipped; the value and the second line go to stdout.
int main()
{
    // Bytes are spelled out explicitly (BOM, "42\n", U+00E8 '_' U+00E9 "\n" in
    // UTF-8) rather than with a u8"" literal: from C++20 on, u8 literals are
    // char8_t arrays and no longer convert to std::string.
    std::istringstream file("\xEF\xBB\xBF" "42\n" "\xC3\xA8_\xC3\xA9\n");

    std::string line;
    std::getline(file, line);

    const char* linePtr = skip_utf8_bom(line.c_str(), line.size());
    std::istringstream input(linePtr);
    int a = -1;
    input >> a;
    if (!input) {
        std::cout << "Error parsing" << std::endl;
    }

    std::getline(file, line);
    std::cout << a << std::endl << line << std::endl;
}