Question

尝试从UTF8文件中提取int时，我一直在苦苦挣扎：

#include <iostream>
#include <fstream>
#include <sstream>

using namespace std;

// Reads the first line of "UTF8.txt", tries to parse it as an int, then
// prints the parsed value followed by the second line of the file.
// Prints "Error parsing" when the first line does not start with a number.
// Nothing is printed when the file cannot be opened.
int main()
{
    std::ifstream file("UTF8.txt");
    if(file.is_open())
    {
        std::string line;
        std::getline(file, line);
        std::istringstream ss(line);
        // Initialised so that a defined value is printed even when the
        // extraction never starts (e.g. the first line is empty); in that
        // case operator>> leaves its argument untouched.
        int a = 0;
        ss >> a;
        if(ss.fail())
        {
            std::cout << "Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file, line);
        std::cout << a << std::endl << line << std::endl;
        file.close();
    }
}

该文件包含两行：“42”和“è_é”，并用记事本保存为UTF-8编码。当文件以ANSI编码保存时，上述代码可以正常工作，但以Unicode（UTF-8）保存时就会失败。我已经尝试了很多方法，其中最有希望的是设置语言环境（locale），但我希望程序不依赖于计算机的语言环境（即使是美国地区设置的电脑，也要能读取中文字符）。老实说，我现在已经没有思路了。如果可能的话，我想避免使用Qt中的CStrings。

更新

以下代码会显示“0”和“Error parsing”，因为文件的最开头有一个奇怪的字符。如果在数字之前加一个空行，并在读取时将其丢弃，就可以正常工作，但在最终程序中我无法修改文件。带重音符号的字符在控制台中不能正确显示，但当我把输出写入文件时一切正常，而这正是我所需要的。所以问题只出在文件开头！

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Second attempt from the question: imbues a UTF-8 facet, then reads an int
// from the first line of "UTF8.srt" and prints it with the second line.
int main()
{
    std::ifstream file("UTF8.srt");
    // NOTE(review): the facet below converts to wchar_t, while `file` is a
    // narrow (char) stream whose buffer looks up the char-to-char codecvt
    // facet. The BOM is therefore still present in the first line.
    file.imbue(std::locale(file.getloc(),
        new std::codecvt_utf8<wchar_t,0x10ffff,std::consume_header>));
    if (file.is_open()) {
        std::string line;
        std::getline(file,line);
        std::istringstream ss{line};
        // Initialised so that a defined value is printed even when the
        // extraction never starts (empty first line); operator>> leaves its
        // argument untouched in that case.
        int a = 0;
        ss >> a;
        if (ss.fail()) {
            std::cout << "Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file,line);
        std::cout << a << std::endl << line << std::endl;
        file.close();
    }
}

解决方案

以下代码可以正常工作，输入文件内容如下：

5
bla bla é_è

6
truc è_é

代码：

#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>

// Do not get used to it:
// using namespace std;

// Returns a pointer just past a leading UTF-8 byte order mark (EF BB BF).
// `s` must point to at least `size` readable bytes. When the buffer is
// shorter than three bytes or does not start with the mark, `s` is returned
// unchanged.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    if (size < 3)
        return s;

    // Compare as unsigned bytes so the result does not depend on whether
    // plain char is signed.
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(s);
    const bool has_bom = bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF;
    return has_bom ? s + 3 : s;
}

// Copies "UTF8.txt" to "UTF8_copy.txt". The input is expected to contain:
// a number line, a text line, an empty line, a second number line and a
// second text line. Number lines are parsed as int (a leading UTF-8 BOM is
// skipped first) and echoed to stdout; text lines are copied verbatim.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if(!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
        return 0;
    }

    std::string line;

    // Reads the next line, parses the integer at its start and writes it to
    // both stdout (prefixed with `label`) and the output file. -1 is written
    // when the line does not start with a number.
    auto copy_number = [&](const char* label) {
        std::getline(file, line);
        const char* linePtr = skip_utf8_bom(line.c_str(), line.size());
        std::istringstream input(linePtr);
        int a = -1;
        input >> a;
        if( ! input) {
            std::cout << "Error parsing" << std::endl;
        }
        std::cout << label << a << std::endl;
        fileO << a << std::endl;
    };

    // Reads the next line and copies it to the output file as is.
    auto copy_line = [&]() {
        std::getline(file, line);
        fileO << line << std::endl;
    };

    //Parse the first number
    copy_number("Number 1: ");

    //Copy the following line as is
    copy_line();

    //Discard empty line, copy it in the output file
    std::getline(file, line);
    fileO << std::endl;

    //Parse the second number (the label used to be a copy-pasted "Number 1: ")
    copy_number("Number 2: ");

    //Copy the following line as is
    copy_line();

    // Both streams are flushed and closed by their destructors.
    return 0;
}

Answer 1

使用 std::codecvt_mode 读取文件

以上链接中的示例：

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>

// Writes a small UTF-8 file that starts with a BOM, then reads it back as
// wide characters through a UTF-8 facet configured with consume_header and
// prints each character's numeric value in hexadecimal.
int main()
{
    // UTF-8 data with BOM; the inner scope closes the file before reading.
    {
        std::ofstream writer("text.txt");
        writer << u8"\ufeffz\u6c34\U0001d10b";
    }

    // read the UTF8 file, skipping the BOM
    std::wifstream reader("text.txt");
    const std::locale utf8_locale(
        reader.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    reader.imbue(utf8_locale);

    std::cout << std::hex << std::showbase;
    wchar_t ch;
    while (reader.get(ch))
        std::cout << ch << '\n';
}

请注意std::consume_header设置。

根据您的问题改编：

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Narrow-character variant: reads an int from the first line of "UTF8.txt"
// and prints it followed by the second line.
//
// std::codecvt_utf8 is only provided for wchar_t, char16_t and char32_t, so
// it cannot be instantiated for char. With a narrow stream the bytes arrive
// unconverted, and the only thing that breaks the parse is the optional
// byte order mark, which is stripped by hand below.
int main()
{
    std::ifstream file("UTF8.txt");
    if (file.is_open()) {
        std::string line;
        std::getline(file,line);
        // Drop a leading UTF-8 BOM (EF BB BF). compare() clamps the range,
        // so lines shorter than three bytes never match.
        if (line.compare(0, 3, "\xEF\xBB\xBF") == 0)
            line.erase(0, 3);
        std::istringstream ss{line};
        // Initialised so that a defined value is printed even when the
        // extraction never starts (empty first line).
        int a = 0;
        ss >> a;
        if (ss.fail()) {
            std::cout << "Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file,line);
        std::cout << a << std::endl << line << std::endl;
        file.close();
    }
}

或者使用wchar_t的版本：

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Wide-character variant: the file is decoded from UTF-8 to wchar_t while
// reading, and consume_header asks the facet to swallow a leading BOM.
// Reads an int from the first line of "UTF8.txt" and prints it followed by
// the second line on std::wcout.
int main()
{
    std::wifstream file("UTF8.txt");
    file.imbue(std::locale(file.getloc(),
        new std::codecvt_utf8<wchar_t,0x10ffff,std::consume_header>));
    if (file.is_open()) {
        std::wstring line;
        std::getline(file,line);
        std::wistringstream ss{line};
        // Initialised so that a defined value is printed even when the
        // extraction never starts (empty first line); operator>> leaves its
        // argument untouched in that case.
        int a = 0;
        ss >> a;
        if (ss.fail()) {
            std::wcout << L"Error parsing" << std::endl;
            ss.clear();
        }
        std::getline(file,line);
        std::wcout << a << std::endl << line << std::endl;
        file.close();
    }
}

Answer 2

只需跳过前导BOM（字节顺序标记）：

#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>

// Do not get used to it:
// using namespace std;

// Skips a UTF-8 byte order mark at the start of a buffer.
// Returns `s + 3` when the first three of the `size` bytes at `s` are
// EF BB BF, and `s` itself otherwise (including when size < 3).
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    const bool long_enough = size >= 3;
    // Short-circuit evaluation keeps the byte reads inside the buffer.
    const bool starts_with_bom = long_enough
        && static_cast<unsigned char>(s[0]) == 0xEF
        && static_cast<unsigned char>(s[1]) == 0xBB
        && static_cast<unsigned char>(s[2]) == 0xBF;

    if (starts_with_bom)
        return s + 3;
    return s;
}


// Self-contained demonstration: an in-memory "file" that starts with a
// UTF-8 BOM, followed by the line "42" and a line with accented characters.
// The BOM is skipped before the first line is parsed as an int; the number
// and the second line are then printed.
int main()
{
    std::istringstream source(u8"\xEF\xBB\xBF""42\n\u00E8_\u00E9\n");

    std::string text;
    std::getline(source, text);

    // Parse from the first byte after the BOM (if any).
    std::istringstream parser(skip_utf8_bom(text.c_str(), text.size()));
    int value = -1;
    parser >> value;
    if (parser.fail()) {
        std::cout << "Error parsing" << std::endl;
    }

    std::getline(source, text);
    std::cout << value << std::endl << text << std::endl;
}

UTF8字符串为int

更新

解

2 个答案: