Question

我使用正则表达式编写了一个简单的程序来对文件进行分词（tokenize）。对于非Unicode内容，它工作正常。对于Unicode内容，我写了一个wregex版本，但是这个版本产生了垃圾输出！

我并不是要在控制台屏幕上输出Unicode字符或字符串，而是将它们存储在map<wstring,int>和wostream类型的文件中，以便值保持完整和正确。但运行应用程序后，包含提取出的标记（token）的文件里只有垃圾！

这个程序有什么问题，我该如何解决？

#include "stdafx.h"

#include <iostream>
#include <regex>
#include <fstream>
#include <string>
#include <map>
using namespace std;

// Reads "ftest.txt" one whitespace-delimited word at a time, counts every
// \w+ match in a map<wstring, int>, and appends "token : count" lines to
// "list.txt". Returns 0.
int main()
{
    // `path` is not used anywhere in this function.
    string path="";    

    // token -> number of occurrences
    map<wstring, int> container;
    // NOTE(review): no imbue() call follows, so the stream keeps the
    // conversion of its default locale; presumably this is why the UTF-8
    // input ends up as the garbage tokens -- confirm.
    wifstream file("ftest.txt"); 
    wregex reg(_T("\\w+"));
    wstring s=_T("");
    // NOTE(review): good() is tested before the extraction, so a failed final
    // read may leave the previous word in `s` and count it again -- verify.
    while (file.good())
    {
        file>>s;
        // Walk all regex matches inside the current word.
        for ( wsregex_iterator it (s.begin(), s.end(), reg),it_end; it != it_end; ++it)
        {
            // (*it)[0] is the whole match; operator[] inserts a zero count
            // the first time a token is seen.
            container[(wstring)(*it)[0]]++ ;
        }

    }

    cout <<"\nDone..."<< endl;
    // ios::app: each run appends to whatever list.txt already contains.
    wofstream output("list.txt",ios::app);
    for (auto item : container)
    {
        //cout<<item.first<<" : "<<item.second<<endl;
        output<<item.first<<" : "<<item.second<<endl;
    }
    // Runs the shell command "pause" before returning.
    system("pause");
    return 0;
}

这是ftest.txt的内容：

بسم الله الرحمن الرحیم 
واشنگتن پست طی گزارشی اعلام کرد کنگره آمریکا برخلاف رویه سابق، ارسال مصوبه سالانه خود در زمینه تحریم های ایران به کاخ سفید را به تاخیر انداخت و به نظر می رسد انتخاب حسن روحانی به عنوان رئیس جمهوری جدید ایران علت این امر بوده است.
0 0 0 نظر
[-]     اندازه متن  [+]


به دنبال انتخاب حسن روحانی به عنوان رئیس جمهوری جدید ایران، کنگره آمریکا بر اساس برخی ملاحظات ارسال مصوبه سالانه خود در زمینه تحریم های ایران به کاخ سفید را به تاخیر انداخت.

这是list.txt中的垃圾输出：

0 : 3
1 : 1
14 : 1
16 : 1
26 : 1
27 : 1
5 : 2
50 : 1
6 : 1
7 : 1
ط : 475
طھ : 12
طھط : 20
طھطµظ : 1
طھظ : 10
طھغ : 2
ط² : 6
ط²ط : 6
ط²ظ : 6
ط³ : 5
ط³ط : 12
ط³طھ : 8
ط³طھط : 4
ط³طھظ : 2
ط³ظ : 10
ط³غ : 1
طµ : 1
طµط : 1
طµظ : 6
ط¹ط : 1
ط¹ظ : 8
ظ : 291
ع : 54
غ : 95
ï : 1

Answer 1

此链接解决了我的问题。:) 如需可移植的解决方案，请查看此链接。

这是完美无缺的最终代码:)：

#include "stdafx.h"
#include <iostream>
#include <regex>
#include <fstream>
#include <string>
#include <map>
#include <fcntl.h> // for _O_U8TEXT
#include <io.h> //for _setmode


using namespace std;

// Tokenizes the UTF-8 file "ftest.txt" with the regex \w+, counts how often
// each token occurs, and writes "token : count" lines to the console and to
// "list.txt". Uses the Microsoft CRT (_wfopen_s, _setmode, _O_U8TEXT) and the
// stream constructors that accept a FILE*, so it is Windows/MSVC specific.
// Returns 0 on success and 1 if either file cannot be opened.
int main()
{
    map<wstring, int> container;

    // Open the input through the CRT so the descriptor can be switched to
    // UTF-8 translation mode before the wide stream reads from it.
    FILE* in = nullptr;
    if (_wfopen_s(&in, L"ftest.txt", L"r") != 0 || in == nullptr)
    {
        wcerr << L"Cannot open ftest.txt" << endl;
        return 1;
    }
    _setmode(_fileno(in), _O_U8TEXT);

    {
        wifstream file(in);
        const wregex reg(L"\\w+");

        wstring s;
        // Test the read itself rather than file.good(), so the loop body only
        // runs for lines that were actually extracted.
        while (getline(file, s))
        {
            for (wsregex_iterator it(s.begin(), s.end(), reg), it_end; it != it_end; ++it)
            {
                // it->str() is the whole match; operator[] inserts a zero
                // count the first time a token is seen.
                ++container[it->str()];
            }
        }
    }   // the stream goes out of scope before the FILE* it wraps is closed

    cout << "\nDone..." << endl;

    fclose(in);

    FILE* out = nullptr;
    if (_wfopen_s(&out, L"list.txt", L"w") != 0 || out == nullptr)
    {
        wcerr << L"Cannot open list.txt" << endl;
        return 1;
    }
    _setmode(_fileno(out), _O_U8TEXT);

    {
        wofstream output(out);

        // Iterate by const reference to avoid copying every (token, count) pair.
        for (const auto& item : container)
        {
            wcout << item.first << " : " << item.second << endl;
            // write output to list.txt
            output << item.first << " : " << item.second << endl;
        }
    }   // the stream goes out of scope before the FILE* it wraps is closed

    fclose(out);
    system("pause");
    return 0;
}

Answer 2

您需要将文件的UTF8编码转换为std::wregex使用的UTF16编码。

使用C++11，您可以使用std::codecvt_utf8_utf16来完成：

// Needs <fstream>, <locale> and <codecvt> (C++11; <codecvt> is deprecated since C++17).
std::wifstream file("ftest.txt");
// The locale takes ownership of the facet, so the raw `new` is not a leak.
// imbue() is called before anything is read from the stream.
file.imbue(std::locale(file.getloc(), new std::codecvt_utf8_utf16<wchar_t>()));
// "file" will now read UTF8 and output UTF16.

在C++11之前，您可以使用boost::locale进行转换，例如：

auto w_s = boost::locale::utf_to_utf<char>(s);

使用wregex创建垃圾输出？

2 个答案: