将UTF8字符串转换为int

时间:2016-03-17 12:04:19

标签: c++ stl utf

尝试从UTF8文件中提取int时,我一直在苦苦挣扎:

#include <iostream>
#include <fstream>
#include <sstream>

using namespace std;

// Opens "UTF8.txt", tries to extract an int from its first line, then prints
// that int followed by the second line of the file. "Error parsing" is
// printed when the extraction fails. Nothing happens if the file cannot be
// opened.
int main()
{
    std::ifstream input("UTF8.txt");
    if (!input.is_open())
        return 0;

    std::string text;
    std::getline(input, text);

    // Parse the number from a private stream so the file position is untouched.
    std::istringstream parser(text);
    int a;
    if (!(parser >> a))
    {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    // The second line is echoed verbatim after the number.
    std::getline(input, text);
    std::cout << a << std::endl;
    std::cout << text << std::endl;

    input.close();
}

该文件包含2行:“42”和“è_é”,并用记事本保存为UTF8。当文件是ANSI编码时,上述代码可以正常工作,但当文件是Unicode编码时就会失败。我已经尝试了很多方法,最有希望的是设置语言环境(locale),但我希望程序不依赖于计算机的语言环境(即使PC是美国的,也要能读取中文字符)。老实说,我现在已经没有思路了。如果可能的话,我想避免使用Qt中的CStrings。

更新

以下代码会显示“0”和“Error parsing”,因为文件的最开头有一个奇怪的字符。如果在数字之前加一个空行,并在读取时将其丢弃,程序就能正常工作,但我无法在最终程序中更改文件。重音字符在控制台中没有正确显示,但是当我将输出写入文件时,一切都很好,而这正是我所需要的。所以这只是文件开头的问题!

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Opens "UTF8.srt" with a UTF-8 codecvt facet imbued, tries to extract an int
// from the first line, then prints that int followed by the second line.
// "Error parsing" is printed when the extraction fails.
int main()
{
    std::ifstream input("UTF8.srt");

    // NOTE(review): the facet below converts to wchar_t while this stream
    // reads char; presumably the narrow stream never consults it, so the
    // leading BOM would stay in the first line -- confirm.
    const std::locale utf8_locale(
        input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);

    if (!input.is_open())
        return 0;

    std::string text;
    std::getline(input, text);

    std::istringstream parser(text);
    int a;
    if (!(parser >> a)) {
        std::cout << "Error parsing" << std::endl;
        parser.clear();
    }

    std::getline(input, text);
    std::cout << a << std::endl;
    std::cout << text << std::endl;

    input.close();
}

以下代码可以正常工作,输入文件内容如下:

5
bla bla é_è

6
truc è_é

代码:

#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>

// Do not get used to it:
// using namespace std;

/// Returns a pointer past a leading UTF-8 byte order mark, if there is one.
///
/// @param s     Start of the character buffer to inspect.
/// @param size  Number of readable characters at @p s.
/// @return @p s advanced by three when the buffer holds at least three
///         characters and starts with the bytes EF BB BF; otherwise @p s.
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    // Too short to contain the three-byte mark: nothing is read from s.
    if (size < 3)
        return s;

    const auto byte_at = [s](std::size_t i) {
        return static_cast<unsigned char>(s[i]);
    };
    const bool has_bom =
        byte_at(0) == 0xEF && byte_at(1) == 0xBB && byte_at(2) == 0xBF;
    return has_bom ? s + 3 : s;
}

// Copies "UTF8.txt" to "UTF8_copy.txt". The input is read as two records
// separated by one blank line; each record is a line holding an int followed
// by a line of text. Every number is parsed (after skipping a UTF-8 BOM at
// the start of its line), echoed to stdout with its ordinal and written to
// the copy; every text line is copied unchanged.
//
// Prints "Error opening files" when either file cannot be opened and
// "Error parsing" when a number line cannot be parsed. Always returns 0.
int main()
{
    std::ifstream file("UTF8.txt");
    std::ofstream fileO("UTF8_copy.txt");
    if(!file || !fileO) {
        std::cout << "Error opening files" << std::endl;
        return 0;
    }

    std::string line;

    // Reads the next line, parses it as an int, reports it on stdout as
    // "Number <index>: <value>" and writes the value to the copy.
    const auto copy_number = [&](int index) {
        std::getline(file, line);
        const char* linePtr = skip_utf8_bom(line.c_str(), line.size());
        std::istringstream input(linePtr);
        int a = -1;
        input >> a;
        if( ! input) {
            std::cout << "Error parsing" << std::endl;
        }
        std::cout << "Number " << index << ": " << a << std::endl;
        fileO << a << std::endl;
    };

    // Reads the next line and writes it to the copy as is.
    const auto copy_line = [&] {
        std::getline(file, line);
        fileO << line << std::endl;
    };

    // First record.
    copy_number(1);
    copy_line();

    // Discard the separator line; an empty line is written in its place.
    std::getline(file, line);
    fileO << std::endl;

    // Second record.
    copy_number(2);
    copy_line();

    // Both streams are closed by their destructors.
    return 0;
}

2 个答案:

答案 0 :(得分:2)

使用 std::codecvt_mode 读取文件

以上链接中的示例:

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>

// Writes a short UTF-8 text with a leading BOM to "text.txt", reads it back
// through a wide stream whose facet skips the BOM, and prints each decoded
// character as a hexadecimal number on its own line.
int main()
{
    // UTF-8 data with BOM; the scope closes the file before it is reopened.
    {
        std::ofstream out("text.txt");
        out << u8"\ufeffz\u6c34\U0001d10b";
    }

    // read the UTF8 file, skipping the BOM
    std::wifstream fin("text.txt");
    const std::locale utf8_locale(
        fin.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    fin.imbue(utf8_locale);

    wchar_t c;
    while (fin.get(c)) {
        std::cout << std::hex << std::showbase << c << '\n';
    }
}

请注意std::consume_header设置。

根据您的问题改编:

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Opens "UTF8.txt", parses an int from its first line (ignoring a leading
// UTF-8 byte order mark), then prints that int followed by the second line.
// "Error parsing" is printed when the first line holds no number. Nothing
// happens if the file cannot be opened.
int main()
{
    std::ifstream file("UTF8.txt");
    if (!file.is_open())
        return 0;

    std::string line;
    std::getline(file, line);

    // std::codecvt_utf8 is only specified for char16_t, char32_t and wchar_t,
    // so a narrow stream cannot rely on it to consume the header. Strip the
    // three BOM bytes (EF BB BF) from the first line by hand instead.
    if (line.compare(0, 3, "\xEF\xBB\xBF") == 0)
        line.erase(0, 3);

    std::istringstream ss{line};
    int a = 0;  // defined value even when extraction fails
    ss >> a;
    if (ss.fail()) {
        std::cout << "Error parsing" << std::endl;
    }

    std::getline(file, line);
    std::cout << a << std::endl << line << std::endl;
    file.close();
}

使用 wchar_t 的版本:

#include <fstream>
#include <iostream>
#include <string>
#include <locale>
#include <codecvt>
#include <sstream>

// Wide-character variant: opens "UTF8.txt" through a wide stream with a
// UTF-8 facet that consumes the header, tries to extract an int from the
// first line, then prints that int followed by the second line.
// "Error parsing" is printed when the extraction fails.
int main()
{
    std::wifstream input("UTF8.txt");
    const std::locale utf8_locale(
        input.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    input.imbue(utf8_locale);

    if (!input.is_open())
        return 0;

    std::wstring text;
    std::getline(input, text);

    std::wistringstream parser(text);
    int a;
    if (!(parser >> a)) {
        std::wcout << L"Error parsing" << std::endl;
        parser.clear();
    }

    std::getline(input, text);
    std::wcout << a << std::endl;
    std::wcout << text << std::endl;

    input.close();
}

答案 1 :(得分:2)

只需跳过前导BOM(字节顺序标记):

#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>

// Do not get used to it:
// using namespace std;

/// Steps over a UTF-8 byte order mark at the start of a buffer.
///
/// @param s     Start of the character buffer to inspect.
/// @param size  Number of readable characters at @p s.
/// @return @p s + 3 when the first three characters are the bytes EF BB BF,
///         otherwise @p s unchanged (including when @p size is below three).
inline const char* skip_utf8_bom(const char* s, std::size_t size)
{
    const unsigned char bom[] = {0xEF, 0xBB, 0xBF};

    // The length check comes first so a short buffer is never indexed.
    if (size < sizeof bom)
        return s;

    for (std::size_t i = 0; i < sizeof bom; ++i) {
        if (static_cast<unsigned char>(s[i]) != bom[i])
            return s;
    }
    return s + sizeof bom;
}


// Self-contained demo: an in-memory "file" starting with a UTF-8 BOM is read
// line by line. The int on the first line is parsed after the BOM is skipped,
// then the int and the second line are printed. "Error parsing" is printed
// when the extraction fails.
int main()
{
    std::istringstream source(u8"\xEF\xBB\xBF""42\n\u00E8_\u00E9\n");

    std::string text;
    std::getline(source, text);

    // Build the number parser from the text that follows the BOM.
    std::istringstream parser(skip_utf8_bom(text.c_str(), text.size()));
    int value = -1;
    if (!(parser >> value)) {
        std::cout << "Error parsing" << std::endl;
    }

    std::getline(source, text);
    std::cout << value << std::endl;
    std::cout << text << std::endl;
}