我正在尝试从一个文件夹中读取多个文本文件,并存储每个单词的起始位置。我使用 Boost 来去除文本中的标点符号。
当单词包含特殊字符(例如 Õ、Ø、æ 等)时会出现问题。在这种情况下,我会收到如下错误消息:"Expression: (unsigned)(c + 1) <= 256"。
以下是我提到的应用程序的代码:
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include<iterator>
#include<string>
#include "/../dirent.h/dirent.h"
#include <boost/tokenizer.hpp>
using namespace std;
using namespace boost;
int main() {
DIR* dir;
dirent* pdir;
dir = opendir("D:/../dataset/");
int number_of_words=0;
int text_length = 30;
char filename[300];
int i=0;
while (pdir = readdir(dir))
{
string fileString;
cout<<"-------------------------------------------"<<endl;
cout<<"Name of text file: "<<pdir->d_name << endl;
strcpy(filename, "D:/.../dataset/");
strcat(filename, pdir->d_name);
ifstream file(filename);
std::istream_iterator<std::string> beg(file), end;
number_of_words = distance(beg,end);
//cout<<"Number of words in file: "<<number_of_words<<endl;
ifstream files(filename);
//char output[200];
if (file.is_open())
{
string output;
while (!files.eof())
{
files >> output;
fileString += " ";
fileString += output;
//cout<<output<<endl;
}
//cout<<fileString<<endl;
cout<<"Number of characters: "<<fileString.size()<<endl;
cout<<"-------------------------------------------"<<endl;
string fileStringTokenized;
tokenizer<>tok (fileString);
int indice_cuvant_curent = 0;
int index = 0;
vector<int> myvector;
for(tokenizer<>::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
{
string currentWord;
currentWord = *beg;
myvector.push_back(index);
index+=currentWord.size();
//cout<<index<<"\t";
//cout<<*beg<<endl;
fileStringTokenized += *beg;
}
}
file.close();
}
closedir(dir);
return 0;
}
为什么会出现此问题,如何解决?
答案 0 :(得分:1)
类似下面这样的代码应该可行:
#include <iostream>
#include <string>
#include <vector>
#include <boost/tokenizer.hpp>
using String = std::wstring;
using Tokenizer = boost::tokenizer< boost::char_delimiters_separator<String::value_type>, String::const_iterator, String>;
int main()
{
String str(L"Õ, Ø, æ");
Tokenizer tok (str);
for(Tokenizer::iterator beg=tok.begin(); beg!=tok.end(); ++beg)
{
std::wcout << (*beg) << L'\n';
}
}
它使用了针对宽字符(wchar_t)实例化的标记器(tokenizer)。
答案 1 :(得分:-2)
请使用 UTF-16 字符串,这将有助于解决该问题。