如何在C++中将ICU BreakIterator与Unicode文本文件一起使用

我在对整个UTF-8编码的高棉语(柬埔寨语)文本文件使用ICU BreakIterator进行断词/断行时遇到了问题(高棉语和泰语一样,词与词之间没有空格)。




Written by George Rhoten, and SBBIC to test how word segmentation works.
Code inspired by the break ICU sample.

Here is an example to run this code in Ubuntu.

./a.out input.txt output.txt

Encode input.txt as UTF-8.
The output text is UTF-8.

#include <stdio.h>
#include <stdlib.h>

#include <fstream>
#include <iostream>
#include <memory>
#include <string>

#include <unicode/brkiter.h>
#include <unicode/ucnv.h>

#define ZW_SPACE "\xE2\x80\x8B"

void printUnicodeString(const UnicodeString &s) {
    int32_t len = s.length() * U8_MAX_LENGTH + 1;
    char *charBuf = new char[len];
    len = s.extract(0, s.length(), charBuf, len, NULL);
    charBuf[len] = 0;
    printf("%s", charBuf);
    delete charBuf;

int main(int argc, char **argv) {   

    //Please provide an input file name as well as an output file name (ex. ./a.out input.txt output.txt)

    //Cannot find the input file you specified ("$input"). Please provide an input file name as well as an output file name (ex. ./a.out input.txt output.txt)

    //Cannot write to output file.  Please check folder permissions.

    std::ifstream input(argv[1]);
    //std::ifstream input("read.txt");
    std::string line;
    std::ofstream o(argv[2]);    
    //std::ofstream o("output.txt");

    //If input file cannot be found ERROR
    if (!input) {
        std::cout<<"Cannot find the input file you specified ("<<argv[1]<<").\nPlease provide an input file name as well as an output file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
        goto stop;  
    //If output file cannot be created ERROR
    if (!o) {
        std::cout<<"Cannot write to output file ("<<argv[2]<<").\nPlease check folder permissions."<<std::endl;
        goto stop;  

    //If no input file is given on command line ERROR
    if(argv[1]==0) {
        std::cout<<"Please provide an input file name as well as an output file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
        goto stop;
    //If no output file is given on command line ERROR
    if(argv[2]==0) {
        std::cout<<"Please provide output file name as well as an input file name\n(example: ./a.out input.txt output.txt)"<<std::endl;
        goto stop;

    while(std::getline(input,line)) {

//Convert standard string to icu UnicodeString

    UnicodeString Nathan = UnicodeString::fromUTF8(StringPiece(line.c_str()));

/* Creating and using text boundaries */
    UnicodeString stringToExamine(Nathan);
    if (argc > 1) {
        // Override the default charset.
        stringToExamine = UnicodeString(Nathan);
        if (stringToExamine.charAt(0) == 0xFEFF) {
            // Remove the BOM
            stringToExamine = UnicodeString(stringToExamine, 1);

    //print each sentence in forward and reverse order
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator* boundary = BreakIterator::createLineInstance(NULL, status);
    if (U_FAILURE(status)) {
        printf("Failed to create sentence break iterator. status = %s", 

    //print each word in order
    int32_t start = boundary->first();
    int32_t end = boundary->next();
    while (end != BreakIterator::DONE) {
        if (start != 0) {

         //output ZWSpace to output file(output.txt)
            o << ZW_SPACE;


        //Set variable NathanOut to current word and print to console
        UnicodeString NathanOut = UnicodeString(stringToExamine, start, end-start);

        //Convert UnicodeString to normal string
            std::string cs;
     //Output the string to file(output.txt)
            o << cs;

            //print output to console

        start = end;
        end = boundary->next();

    delete boundary;

    }//end of while

    return 0;

