我想用ICU BreakIterator对整个UTF-8编码的高棉语(柬埔寨语)文本文件进行分词(高棉语与泰语类似,词与词之间没有空格),但遇到了问题。
我拿到了一个示例程序,并将其修改为逐行读取文本文件,但问题是:如果某一行只有一个单词,BreakIterator就不能正常工作,因为我们将其配置为尝试连续找到至少3个单词(这对高棉语来说是必要的,否则BreakIterator就不那么准确了)。
有人可以帮我理解如何克服这个问题吗?我认为最简单的方法是将整个文本文件读入缓冲区,但我似乎无法让它正常工作。
以下是我目前使用的完整代码,它逐行对文本文件中的文本进行分词:
/*
Written by George Rhoten, and SBBIC to test how word segmentation works.
Code inspired by the break ICU sample.
Here is an example to run this code in Ubuntu.
./a.out input.txt output.txt
Encode input.txt as UTF-8.
The output text is UTF-8.
*/
#include <string>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <unicode/brkiter.h>
#include <unicode/ucnv.h>
#include <stdlib.h>
// UTF-8 byte sequence for U+200B ZERO WIDTH SPACE; main() writes it between
// consecutive segments reported by the break iterator.
#define ZW_SPACE "\xE2\x80\x8B"
// Print a UnicodeString to stdout without a trailing newline.
//
// The string is converted with UnicodeString::extract() using a NULL codepage
// (per the ICU API docs this selects ICU's default converter; main() sets the
// default converter name to "UTF-8" before calling this function).
//
// s: the text to print. Output stops at the first NUL byte because the bytes
//    are handed to printf("%s").
void printUnicodeString(const UnicodeString &s) {
    // Worst case: U8_MAX_LENGTH bytes per UTF-16 code unit, plus a terminator.
    const int32_t capacity = s.length() * U8_MAX_LENGTH + 1;

    // std::string owns the buffer, so it is released correctly on every path
    // (the previous code paired new[] with scalar delete, which is undefined
    // behaviour).
    std::string buffer(static_cast<std::string::size_type>(capacity), '\0');

    int32_t written = s.extract(0, s.length(), &buffer[0], capacity, NULL);

    // Defensive clamp so the terminator always lands inside the buffer.
    if (written < 0) {
        written = 0;
    } else if (written >= capacity) {
        written = capacity - 1;
    }
    buffer[static_cast<std::string::size_type>(written)] = '\0';

    printf("%s", buffer.c_str());
}
int main(int argc, char **argv) {
//Please provide an input file name as well as an output file name (ex. ./a.out input.txt output.txt)
//Cannot find the input file you specified ("$input"). Please provide an input file name as well as an output file name (ex. ./a.out input.txt output.txt)
//Cannot write to output file. Please check folder permissions.
std::ifstream input(argv[1]);
//std::ifstream input("read.txt");
std::string line;
std::ofstream o(argv[2]);
//std::ofstream o("output.txt");
//If input file cannot be found ERROR
if (!input) {
std::cout<<"Cannot find the input file you specified ("<<argv[1]<<").\nPlease provide an input file name as well as an output file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
goto stop;
}
//If output file cannot be created ERROR
if (!o) {
std::cout<<"Cannot write to output file ("<<argv[2]<<").\nPlease check folder permissions."<<std::endl;
goto stop;
}
//If no input file is given on command line ERROR
if(argv[1]==0) {
std::cout<<"Please provide an input file name as well as an output file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
goto stop;
}
//If no output file is given on command line ERROR
if(argv[2]==0) {
std::cout<<"Please provide output file name as well as an input file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
goto stop;
}
while(std::getline(input,line)) {
//Convert standard string to icu UnicodeString
UnicodeString Nathan = UnicodeString::fromUTF8(StringPiece(line.c_str()));
/* Creating and using text boundaries */
ucnv_setDefaultName("UTF-8");
UnicodeString stringToExamine(Nathan);
if (argc > 1) {
// Override the default charset.
stringToExamine = UnicodeString(Nathan);
if (stringToExamine.charAt(0) == 0xFEFF) {
// Remove the BOM
stringToExamine = UnicodeString(stringToExamine, 1);
}
}
//printUnicodeString(stringToExamine);
//puts("");
//print each sentence in forward and reverse order
UErrorCode status = U_ZERO_ERROR;
BreakIterator* boundary = BreakIterator::createLineInstance(NULL, status);
if (U_FAILURE(status)) {
printf("Failed to create sentence break iterator. status = %s",
u_errorName(status));
exit(1);
}
//print each word in order
boundary->setText(stringToExamine);
int32_t start = boundary->first();
int32_t end = boundary->next();
while (end != BreakIterator::DONE) {
if (start != 0) {
printf(ZW_SPACE);
//output ZWSpace to output file(output.txt)
o << ZW_SPACE;
//filestr<<ZW_SPACE;
//filestr.close();
}
//Set variable NathanOut to current word and print to console
UnicodeString NathanOut = UnicodeString(stringToExamine, start, end-start);
printUnicodeString(NathanOut);
//Convert UnicodeString to normal string
std::string cs;
NathanOut.toUTF8String(cs);
//Output the string to file(output.txt)
o << cs;
//print output to console
printUnicodeString(NathanOut);
start = end;
end = boundary->next();
}
delete boundary;
}//end of while
stop:
return 0;
}