在Java中统计单词和字符时,如何处理高/低代理项?

时间:2015-10-20 09:30:58

标签: java string count utf-16

我知道SO上已经有一些关于这个主题的问题,但其中提出的所有解决方案似乎都采用了与我在javascript中看到的示例不同的方法。

这是一个javascript示例,用于统计文本字符串中的段落、句子、单词和字符,其中在统计字符时会专门检查高/低代理项:

javascript版

// Example invocation: `text` is the string to analyse (assumed to be supplied by the caller).
count(text);

/**
 * Counts paragraphs, sentences, words and characters in a piece of text.
 *
 * All metrics except `all` are computed on a cleaned copy of the input in
 * which every zero-width space (U+200B) has been removed and leading/trailing
 * whitespace has been trimmed. If that cleaned copy is empty, those metrics
 * are 0.
 *
 * @param   {String}  original  The raw text to analyse.
 *
 * @return  {Object}  An object with the numeric properties:
 *                    `paragraphs` - runs of newlines, plus one;
 *                    `sentences`  - runs of `. ? ! …` or newline that are
 *                                   followed by another character, plus one;
 *                    `words`      - runs of non-whitespace after the listed
 *                                   punctuation has been removed;
 *                    `characters` - length of `_decode` applied to the
 *                                   cleaned text with all whitespace removed;
 *                    `all`        - length of `_decode` applied to the
 *                                   untouched input.
 */
function count(original) {
    // The `g` flag matters: without it only the first run of zero-width
    // spaces is removed and later ones are counted as characters.
    var trimmed = original.replace(/[\u200B]+/g, '').trim();

    return {
        paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
        sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
        words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
        characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
        all: _decode(original).length
    };
}

/**
 * Converts a string into an array of Unicode code points.
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
 * (0xDC00-0xDFFF) is merged into one code point above 0xFFFF. Every other
 * code unit, including an unpaired surrogate, is emitted unchanged as a
 * single entry.
 *
 * @param   {String}  string  The input string.
 *
 * @return  {Array}   The code points, in order.
 */
function _decode(string) {
    var output = [],
        counter = 0,
        length = string.length,
        value, extra;

    while (counter < length) {
        value = string.charCodeAt(counter++);

        if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
            // High surrogate with a following code unit: peek at it without
            // consuming, so that an unmatched follower is handled exactly
            // once by the next iteration (it may itself start a pair).
            extra = string.charCodeAt(counter);

            if ((extra & 0xFC00) === 0xDC00) {
                // Low surrogate: consume it and emit the combined code point.
                counter++;
                output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
                continue;
            }
        }

        // Plain code unit or unmatched surrogate: emit it on its own.
        output.push(value);
    }
    return output;
}

演示如下(也可在jsfiddle中查看):

// Demo: run the counter over a sample text, then render the text followed by
// one line per metric.
var text = 'This is a paragraph. This is the 2nd sentence in the 1st paragraph.\nThis is another paragraph.';
var count = doCount(text);

// Build the markup first and assign it once; appending to innerHTML inside
// the loop would re-serialise and re-parse the whole body on every pass.
// NOTE(review): `text` is inserted unescaped, which is fine for this literal
// but must be escaped if the input ever comes from a user.
var html = '<pre>' + text + '</pre><hr>';
var key; // declared explicitly so the loop does not create an implicit global

for (key in count) {
    if (Object.prototype.hasOwnProperty.call(count, key)) {
        html += '<p>' + key + ': ' + count[key] + '</p>';
    }
}

document.body.innerHTML = html;

/* COUNTING LIBRARY */

/**
 * Counts paragraphs, sentences, words and characters in a piece of text.
 *
 * Extracted from https://github.com/RadLikeWhoa/Countable/, which in
 * turn uses the `ucs2decode` function from the punycode.js library.
 *
 * All metrics except `all` are computed on a cleaned copy of the input in
 * which every zero-width space (U+200B) has been removed and leading/trailing
 * whitespace has been trimmed. If that cleaned copy is empty, those metrics
 * are 0.
 *
 * @param   {String}  original  The raw text to analyse.
 *
 * @return  {Object}  An object with the numeric properties:
 *                    `paragraphs` - runs of newlines, plus one;
 *                    `sentences`  - runs of `. ? ! …` or newline that are
 *                                   followed by another character, plus one;
 *                    `words`      - runs of non-whitespace after the listed
 *                                   punctuation has been removed;
 *                    `characters` - length of `_decode` applied to the
 *                                   cleaned text with all whitespace removed;
 *                    `all`        - length of `_decode` applied to the
 *                                   untouched input.
 */
function doCount(original) {
    // The `g` flag matters: without it only the first run of zero-width
    // spaces is removed and later ones are counted as characters.
    var trimmed = original.replace(/[\u200B]+/g, '').trim();

    return {
        paragraphs: trimmed ? (trimmed.match(/\n+/g) || []).length + 1 : 0,
        sentences: trimmed ? (trimmed.match(/[.?!…\n]+./g) || []).length + 1 : 0,
        words: trimmed ? (trimmed.replace(/['";:,.?¿\-!¡]+/g, '').match(/\S+/g) || []).length : 0,
        characters: trimmed ? _decode(trimmed.replace(/\s/g, '')).length : 0,
        all: _decode(original).length
    };
}

/**
 * `ucs2decode` function from the punycode.js library.
 *
 * Creates an array containing the numeric code points of each Unicode
 * character in the string. A string exposes the two halves of a surrogate
 * pair as separate code units; this function merges a high surrogate
 * (0xD800-0xDBFF) that is immediately followed by a low surrogate
 * (0xDC00-0xDFFF) into a single code point, matching UTF-16. Every other
 * code unit, including an unpaired surrogate, is emitted unchanged as a
 * single entry.
 *
 * @see     <http://goo.gl/8M09r>
 * @see     <http://goo.gl/u4UUC>
 *
 * @param   {String}  string   The Unicode input string (UCS-2).
 *
 * @return  {Array}   The new array of code points.
 */
function _decode(string) {
    var output = [],
        counter = 0,
        length = string.length,
        value, extra;

    while (counter < length) {
        value = string.charCodeAt(counter++);

        if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
            // High surrogate with a following code unit: peek at it without
            // consuming, so that an unmatched follower is handled exactly
            // once by the next iteration (it may itself start a pair).
            extra = string.charCodeAt(counter);

            if ((extra & 0xFC00) === 0xDC00) {
                // Low surrogate: consume it and emit the combined code point.
                counter++;
                output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
                continue;
            }
        }

        // Plain code unit or unmatched surrogate: emit it on its own.
        output.push(value);
    }

    return output;
}

我不熟悉字符编码方案和高/低代理项,但是在使用java计算时不需要这样做吗?

我对javascript实现的结果感到满意,我想对我的java后端进行计数,但我不确定是否需要相同的方法或者应该如何完成。

1 个答案:

答案 0 :(得分:0)

所以javascript版本的作用是:如果被解码的文本中出现代理对,就把它们读作一个字符。之所以需要这样做,是因为根据Javascript引擎的不同,UCS-2和UTF-16都是允许的;UTF-16支持代理对,也就是说单个可见字符可能由两个代码单元编码。为了正确计算长度,该库会把这两个代码单元一并考虑,因此它们被计为一个字符。

在Java中你会遇到类似的问题,只不过在Java中可以使用的编码方案更多。幸运的是,Java已经为包含高代理项的String返回了正确的长度。(注:`String.length()`返回的是UTF-16代码单元的数量;若要按代码点计数,应使用`String.codePointCount`。)但是,如果您想要分离组合代码点甚至删除它们,Java在java.text包中提供了Normalizer类(参见移除变音符号的示例)。

  

// Normalize to form NFD (canonical decomposition), which separates base characters from their combining marks.
string = Normalizer.normalize(string, Normalizer.Form.NFD);