我有这段代码将较长的行分成等长字符串的数组,保留单词,同时考虑格式如[[u;#fff;]some text]
,它分割文本,以便每个字符串可以独立转换为html:
var format_re = /\[\[([!gbiuso]*;[^;\]]*;[^;\]]*(?:;|[^\]()]*);?[^\]]*)\]([^\]]*\\\][^\]]*|[^\]]*|[^\[]*\[[^\]]*)\]?/gi;
var format_begin_re = /(\[\[[!gbiuso]*;[^;]*;[^\]]*\])/i;
var format_last_re = /\[\[[!gbiuso]*;[^;]*;[^\]]*\]?$/i;
$.terminal.split_equal = function(str, length, words) {
var formatting = false;
var in_text = false;
var prev_format = '';
var result = [];
// add format text as 5th paramter to formatting it's used for
// data attribute in format function
var array = str.replace(format_re, function(_, format, text) {
var semicolons = format.match(/;/g).length;
// missing semicolons
if (semicolons == 2) {
semicolons = ';;';
} else if (semicolons == 3) {
semicolons = ';';
} else {
semicolons = '';
}
// return '[[' + format + ']' + text + ']';
// closing braket will break formatting so we need to escape
// those using html entity equvalent
return '[[' + format + semicolons +
text.replace(/\\\]/g, ']').replace(/\n/g, '\\n') + ']' +
text + ']';
}).split(/\n/g);
for (var i = 0, len = array.length; i < len; ++i) {
if (array[i] === '') {
result.push('');
continue;
}
var line = array[i];
var first_index = 0;
var count = 0;
var space = -1;
for (var j=0, jlen=line.length; j<jlen; ++j) {
if (line[j] === '[' && line[j+1] === '[') {
formatting = true;
} else if (formatting && line[j] === ']') {
if (in_text) {
formatting = false;
in_text = false;
} else {
in_text = true;
}
} else if ((formatting && in_text) || !formatting) {
if (line[j] === '&') { // treat entity as one character
var m = line.substring(j).match(/^(&[^;]+;)/);
if (!m) {
// should never happen if used by terminal,
// because it always calls $.terminal.encode
// before this function
throw new Error("Unclosed html entity in line " +
(i+1) + ' at char ' + (j+1));
}
j+=m[1].length-2; // because continue adds 1 to j
// if entity is at the end there is no next loop
// issue #77
if (j === jlen-1) {
result.push(output + m[1]);
}
continue;
} else if (line[j] === ']' && line[j-1] === '\\') {
// escape \] counts as one character
--count;
} else {
++count;
}
}
function is_space() {
return line.substring(j-6, j) == ' ' ||
line.substring(j-1, j) == ' ';
}
if (is_space() && ((formatting && in_text) || !formatting)) {
space = j;
}
if ((count === length || j === jlen-1) &&
((formatting && in_text) || !formatting)) {
var output;
var after = line.substring(space, j+length+1);
var text = $('<span>' + after + '</span>').text();
var can_break = text.match(/\s/);
if (words && space != -1 && j !== jlen-1 && can_break) {
// get text to last space
output = line.substring(first_index, space);
j = space-1;
space = -1;
} else {
output = line.substring(first_index, j+1);
}
if (words) {
output = output.replace(/^( |\s)+|( |\s)+$/g, '');
}
first_index = j+1;
count = 0;
if (prev_format) {
output = prev_format + output;
if (output.match(']')) {
prev_format = '';
}
}
// Fix output if formatting not closed
var matched = output.match(format_re);
if (matched) {
var last = matched[matched.length-1];
if (last[last.length-1] !== ']') {
prev_format = last.match(format_begin_re)[1];
output += ']';
} else if (output.match(format_last_re)) {
var line_len = output.length;
// why this line ???
//var f_len = line_len-last[last.length-1].length;
output = output.replace(format_last_re, '');
prev_format = last.match(format_begin_re)[1];
}
}
result.push(output);
}
}
}
return result;
};
它的工作方式几乎正确,但有些线路应该更短:
is cracker.The term
在此FIDDLE中,当您删除格式,检查复选框时,它可以正常工作。我在这方面工作了几个小时,并且不知道为什么这条线更短,任何帮助都将非常感激。
答案 0 :(得分:4)
我认为我使用更简单的方法解决了这个问题。首先分解所有单词,然后在跟踪当前格式的同时重新组合行。请参阅JsFiddle。
<强>的JavaScript 强>
$.terminal.split_equal = function(str, length, words) {
var result = [],
currentFormat = null,
currentLine = '',
currentLineLengthWithoutFormatting = 0;
// 1. Split words on
words = str.split(/ /g);
// 2. Re-assemble lines while keeping track of current formats
words.forEach(function(word) {
// Keep track of current format
var format = word.match(/^\[\[([^\]]+)\]/g),
wordWithFormatting, wordLength;
if (format !== null && format[0]) {
currentFormat = format[0];
word = word.slice(format[0].length);
}
// Apply current format to each word separatly
wordLength = word.length;
wordWithFormatting = (currentFormat || '') + word;
if (currentFormat) {
if (word.indexOf(']') !== -1) {
wordLength--;
currentFormat = null;
} else {
wordWithFormatting += ']';
}
}
// Assemble line
if (currentLineLengthWithoutFormatting + wordLength <= length) {
// Word still fits on current line
if (currentLineLengthWithoutFormatting > 0) {
currentLine += ' ';
currentLineLengthWithoutFormatting++;
}
} else {
// Need to start new line
result.push(currentLine);
currentLine = '';
currentLineLengthWithoutFormatting = 0;
}
currentLine += wordWithFormatting;
currentLineLengthWithoutFormatting += wordLength;
});
if (currentLineLengthWithoutFormatting > 0)
result.push(currentLine);
return result;
};
答案 1 :(得分:4)
以下是修复原始代码的方法:
在第40行之后添加以下内容:
in_text = false;
代码使用in_text
标志来确定当前位置是否是常规文本。但是,当它进入格式化标记区域时,它没有清除标记。这是超短线问题中描述的主要问题的原因。
将第76/77行的if语句更改为:
if (is_space() && ((formatting && in_text) || !formatting || (line[j] === '[' && line[j+1] === '['))) {
这解决了一个较小的问题,即常规文本和格式化文本之间的空格没有发生换行。
在这里工作小提琴:https://jsfiddle.net/2w10xp3m/1/
答案 2 :(得分:0)
npm软件包paragraph-builder将连续文本拆分成均匀分布的所谓段落,并且所有段落的字数大致相同。段落的概念似乎就是您要搜索的内容。
您可以定义段落的单词数。您可以将段落的原理扩展到页面,考虑到页面平均具有大约相同数量的字符,包括空格。
此段落构建器节点脚本从连续文本生成段落。它输出一个文本,其中每个段落的大小大致相同,从而在文本中提供均匀的段落分布。它不会将数字拆分为“ 1.2”之类的数字。
有一个选项可以定义段落之间的分隔符,或者您可以将段落提取到一个字符串数组中,并从中应用html标签<p>
。检查其文档以进一步澄清。