Question

我正在尝试从一个文件夹中读取多个文本文件，并记录每个单词的起始位置。我使用 Boost 去除文本中的标点符号。

当单词包含特殊字符（例如 Õ、Ø、æ 等）时会出现问题。此时我会收到错误消息：“Expression: (unsigned)(c + 1) <= 256”。

以下是我提到的应用程序的代码：

#include "stdafx.h"
#include <iostream>
#include <fstream>
#include<iterator>
#include<string>
#include "/../dirent.h/dirent.h"
#include <boost/tokenizer.hpp>

using namespace std;
using namespace boost;

// Iterates over the entries of a directory, reads each entry as a text file
// word by word into a single string, tokenizes that string with
// boost::tokenizer<> and records a running offset for every token.
// Nothing is returned or printed from the offsets; the function only prints
// the file name and the size of the accumulated string.
int main() {

    DIR*     dir;
    dirent*  pdir;

    // The result of opendir() is not checked before it is passed to readdir().
    dir = opendir("D:/../dataset/"); 

    // number_of_words is assigned below but never read; text_length and i are
    // not used anywhere else in this function.
    int number_of_words=0;
    int text_length = 30;
    char filename[300];
    int i=0;
    // Assignment inside the condition is intentional: the loop ends when
    // readdir() yields a null pointer.
    // NOTE(review): entries are not filtered, so any non-file entries that
    // readdir() reports are also handed to ifstream below - confirm.
    while (pdir = readdir(dir)) 
    {
        string fileString;

        cout<<"-------------------------------------------"<<endl;
        cout<<"Name of text file: "<<pdir->d_name << endl;
        // Builds "<prefix><entry name>" in a fixed 300-byte buffer; neither
        // strcpy nor strcat checks the length against that buffer.
        // NOTE(review): this prefix ("D:/.../dataset/") differs from the path
        // given to opendir ("D:/../dataset/") - confirm which one is intended.
        strcpy(filename, "D:/.../dataset/");
        strcat(filename, pdir->d_name);
        ifstream file(filename);
        std::istream_iterator<std::string> beg(file), end;

        // Counting the words with distance() reads `file` to its end, which is
        // why the content is read again through a second stream (`files`).
        number_of_words = distance(beg,end);

        //cout<<"Number of words in file: "<<number_of_words<<endl;
        ifstream files(filename);
         //char output[200];

         // Tests the first stream (`file`), not `files`, which is the stream
         // actually read inside the block.
         if (file.is_open()) 
         {

             string output;

             // Appends every whitespace-separated word, each preceded by one
             // space, to fileString.
             // NOTE(review): eof() is tested before the extraction, so a failed
             // final read may append the previous word a second time - confirm.
             while (!files.eof())
             {

                    files >> output;
                    fileString += " ";
                    fileString += output;
                    //cout<<output<<endl;

             }
             //cout<<fileString<<endl;
             cout<<"Number of characters: "<<fileString.size()<<endl;
             cout<<"-------------------------------------------"<<endl;


            string fileStringTokenized;
            // NOTE(review): tokenizer<> with default arguments works on narrow
            // char; bytes of characters such as Õ, Ø, æ are presumably negative
            // when char is signed, which would explain the reported debug
            // assertion "(unsigned)(c + 1) <= 256" - confirm against the
            // runtime's character-classification functions.
            tokenizer<>tok (fileString);

            // indice_cuvant_curent is declared but never read in this function.
            int indice_cuvant_curent = 0;
            int index = 0;
            vector<int> myvector;

            for(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
            {
                string currentWord;
                currentWord = *beg;

                // index is the sum of the sizes of all previous tokens, i.e. the
                // offset of this token inside fileStringTokenized (separators
                // are not counted), not its offset inside fileString.
                myvector.push_back(index);
                index+=currentWord.size();
                //cout<<index<<"\t";

                //cout<<*beg<<endl;
                fileStringTokenized += *beg;
            }

         }
         file.close();
    }
    closedir(dir);
    return 0;
}

为什么会出现此问题，如何解决？

Answer 1

类似下面这样的代码应该可行：

#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/tokenizer.hpp>

using String = std::wstring;
using Tokenizer = boost::tokenizer< boost::char_delimiters_separator<String::value_type>, String::const_iterator, String>;
int main()
{
    String str(L"Õ, Ø, æ");
    Tokenizer tok (str);

    for(Tokenizer::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
    {
        std::wcout << (*beg) << L'\n';
    }
}

它使用了针对宽字符（wchar_t）的 tokenizer。

Answer 2

使用 UTF-16 字符串，这将有助于解决您的问题。

如何处理 C++ 程序中的特殊字符？

2 个答案: