Question

我在对整个UTF-8编码的高棉语（柬埔寨语）文本文件使用ICU BreakIterator来查找断词/断行位置时遇到了问题（高棉语和泰语一样，单词之间没有空格）。

我使用了别人给我的一个示例，并将其修改为逐行读取文本文件。但问题是，如果某一行只有一个单词，BreakIterator就不能正常工作，因为我们将其配置为尝试连续找到至少3个单词（对于高棉语来说这是必要的，否则BreakIterator就不那么准确了）。

有人可以帮我理解如何克服这个问题吗？我认为最简单的方法是将整个文本文件读入缓冲区，但我似乎无法让它正常工作。

以下是我目前使用的完整代码，它逐行对文本文件中的单词进行切分：

/*
Written by George Rhoten, and SBBIC to test how word segmentation works.
Code inspired by the break ICU sample.

Here is an example to run this code in Ubuntu.

./a.out input.txt output.txt

Encode input.txt as UTF-8.
The output text is UTF-8.
*/

#include <stdio.h>
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <memory>
#include <string>

#include <unicode/brkiter.h>
#include <unicode/ucnv.h>

// UTF-8 byte sequence for U+200B ZERO WIDTH SPACE, written between segments.
#define ZW_SPACE "\xE2\x80\x8B"

/**
 * Prints a UnicodeString to stdout.
 *
 * The text is extracted into a temporary byte buffer sized for
 * U8_MAX_LENGTH bytes per UTF-16 code unit plus a terminating NUL. The NULL
 * codepage argument makes ICU use its default converter (main() sets that to
 * UTF-8 before calling this function).
 *
 * @param s text to print; an empty string prints nothing.
 */
void printUnicodeString(const UnicodeString &s) {
    const int32_t capacity = s.length() * U8_MAX_LENGTH + 1;
    char *charBuf = new char[capacity];
    int32_t len = s.extract(0, s.length(), charBuf, capacity, NULL);
    // extract() reports the length it wanted to write; never index past the
    // buffer if that exceeds what was actually allocated.
    if (len < 0) {
        len = 0;
    } else if (len >= capacity) {
        len = capacity - 1;
    }
    charBuf[len] = 0;
    printf("%s", charBuf);
    // The buffer was allocated with new[], so it must be released with
    // delete[]; a scalar delete here is undefined behaviour.
    delete[] charBuf;
}

int main(int argc, char **argv) {   


    //Please provide an input file name as well as an output file name (ex. ./a.out input.txt output.txt)

    //Cannot find the input file you specified ("$input"). Please provide an input file name as well as an output file name (ex. ./a.out input.txt output.txt)

    //Cannot write to output file.  Please check folder permissions.

    std::ifstream input(argv[1]);
    //std::ifstream input("read.txt");
    std::string line;
    std::ofstream o(argv[2]);    
    //std::ofstream o("output.txt");


    //If input file cannot be found ERROR
    if (!input) {
        std::cout<<"Cannot find the input file you specified ("<<argv[1]<<").\nPlease provide an input file name as well as an output file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
        goto stop;  
    }
    //If output file cannot be created ERROR
    if (!o) {
        std::cout<<"Cannot write to output file ("<<argv[2]<<").\nPlease check folder permissions."<<std::endl;
        goto stop;  

    }
    //If no input file is given on command line ERROR
    if(argv[1]==0) {
        std::cout<<"Please provide an input file name as well as an output file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
        goto stop;
    }
    //If no output file is given on command line ERROR
    if(argv[2]==0) {
        std::cout<<"Please provide output file name as well as an input file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
        goto stop;
    }


    while(std::getline(input,line)) {


//Convert standard string to icu UnicodeString

    UnicodeString Nathan = UnicodeString::fromUTF8(StringPiece(line.c_str()));


/* Creating and using text boundaries */
    ucnv_setDefaultName("UTF-8");
    UnicodeString stringToExamine(Nathan);
    if (argc > 1) {
        // Override the default charset.
        stringToExamine = UnicodeString(Nathan);
        if (stringToExamine.charAt(0) == 0xFEFF) {
            // Remove the BOM
            stringToExamine = UnicodeString(stringToExamine, 1);
        }
    }
    //printUnicodeString(stringToExamine);
    //puts("");

    //print each sentence in forward and reverse order
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator* boundary = BreakIterator::createLineInstance(NULL, status);
    if (U_FAILURE(status)) {
        printf("Failed to create sentence break iterator. status = %s", 
            u_errorName(status));
        exit(1);
    }

    //print each word in order
    boundary->setText(stringToExamine);
    int32_t start = boundary->first();
    int32_t end = boundary->next();
    while (end != BreakIterator::DONE) {
        if (start != 0) {
            printf(ZW_SPACE);

         //output ZWSpace to output file(output.txt)
            o << ZW_SPACE;


            //filestr<<ZW_SPACE;
            //filestr.close();



        }
        //Set variable NathanOut to current word and print to console
        UnicodeString NathanOut = UnicodeString(stringToExamine, start, end-start);
        printUnicodeString(NathanOut);



        //Convert UnicodeString to normal string
            std::string cs;
            NathanOut.toUTF8String(cs);
     //Output the string to file(output.txt)
            o << cs;


            //print output to console
            printUnicodeString(NathanOut);


        start = end;
        end = boundary->next();
    }

    delete boundary;



    }//end of while

    stop:
    return 0;
}

如何在C ++中将ICU BreakIterator与Unicode文本文件一起使用

0 个答案: