我正在讨论这段代码的问题,我已经编写了这些代码来清理并将文本输入到100-140个字符行,同时尊重字边界。此脚本旨在按字或字符进行解析,因此我所指的函数是wordQueue。所以它基本上需要输入文本,在换行符上拆分,与空格连接并在空格上分割以给出“单词”数组。当while循环移出一个单词时,如果chunk + word小于最大块长度,则将其添加到当前chunk。如果它超过最大长度,则将当前块推入输出数组,重置块并重新开始。每个循环时间它还检查剩余要处理的剩余单词是否小于由字符串连接时的最小块长度。它打破了循环并使用传递的完成回调写入文件。
这不是发生的事情。在脚本结束之前,它可能会通过50个块。根本没有调用完成的回调。我可以看到阵列中还有44k +字还要处理。我必须错过一些不合适的逻辑。我会说这段代码正在运行,但这恰好是我在回购中没有的脚本之一,现在做了一些改动它没有用。
var
fs = require('fs'),
async = require('async'),
_ = require('underscore');
var
args = process.argv.splice(2),
scriptInput,
scriptOutput = [],
csvFileName = args[0],
config = {
splitBy: args[1] || 'word',
};
var re = {
NL: /\n/,
LR: /\r/,
allTabs: /\t/g,
multipleSpaces: /\s{2,}/,
whiteSpace: /\s/g,
leadingWhitespace: /^\s*/,
trailingWhitespace: /\s$/,
byCharBlacklist: /[a-zA-Z0-9!!@#%=\--—_??.。,、,::;"'“”|\$\^&*\(\)(){}\\\/\[\]<>°…†]/g,
byCharLineSplit: /.{1,52}/g
}
var
cleanInput,
wordQueue,
charQueue,
workQueue;
cleanInput = function (text) {
return text
// Normalize new lines
.replace(re.LR, '')
// Trim
.replace(re.allTabs, ' ')
.replace(re.multipleSpaces, ' ')
.replace(re.leadingWhitespace, '')
.replace(re.trailingWhitespace, '');
};
wordQueue = function (input, done) {
var output = [], currentChunk = [], queue = input.split("\n").join(' ').split(' ');
// While we have and words left to processed
while (nextSegment = queue.shift()) {
// If the currentChunk + current word will remain under max length, append to currentChunk
if (currentChunk.join(' ').length + nextSegment.length <= config.maxChunkSize) {
currentChunk.push(nextSegment);
console.log("Gluing: %s", nextSegment);
console.log("Words left: ", queue.length);
console.log("\n");
// If not, push the current currentChunk and reset for next cycle
} else {
output.push(currentChunk.join(' '));
currentChunk = [];
console.log('Pushing chunk "%s" in to output', currentChunk.join(' '));
console.log("\n");
}
// If the remaining segments can't be glued to min length, break loop.
if (queue.join(' ').length < config.minChunkSize) {
console.log('End of data.');
console.log('Writing output to file...');
done(output.join("\n"));
break;
}
}
};
charQueue = function (input, done) {
var lines;
input = input
.split("\n")
.join('')
.replace(re.byCharBlacklist, '')
.replace(re.whiteSpace, '');
lines = input.match(re.byCharLineSplit);
lines = _.map(lines, function (line) {
return line.split('').join(' ');
});
done(lines.join("\n"));
};
/// RUN
config.minChunkSize = 100;
config.maxChunkSize = (config.splitBy == 'word') ? 140 : 105;
scriptInput = cleanInput(fs.readFileSync(csvFileName, 'utf8'));
workQueue = (config.splitBy == 'word') ? wordQueue : charQueue;
workQueue(scriptInput, function (output) {
fs.writeFile(csvFileName+'.faster.csv', output, {encoding: 'utf16le'}, function (err) {
if (err) throw err;
console.log('File write complete: %s', csvFileName+'.faster.csv');
});
});
答案 0 :(得分:1)
这是一个循环的危险条件:nextSegment = queue.shift()
您期望值undefined
结束循环,但请注意,空字符串也将计算为false。相反,我建议使用:
while ((nextSegment = queue.shift()) !== undefined) {