I'm trying to write several million strings to a file using Node.js streams, but RAM usage climbs to as much as 800MB while it runs:
const fs = require('fs')
const walkdir = require('walkdir')

let options = {
  "max_depth": 0,
  "track_inodes": true
}

let dir = "C:/"
let paths = walkdir(dir, options)
var wstream = fs.createWriteStream('C:/test/file.txt')
wstream.write('[')

paths.on('path', function(path, stat) {
  wstream.write(`"${path}",`)
})

paths.on('end', function(path, stat) {
  wstream.write(']')
  wstream.end()

  // Compressing the file after it's written:
  const gzip = require('zlib').createGzip()
  const inp = fs.createReadStream('C:/test/file.txt')
  const out = fs.createWriteStream('C:/test/file.txt.gz')
  inp.pipe(gzip).pipe(out)
})
I also tried writing the file like this:
...
paths.on('path', function(path, stat) {
  fs.writeFileSync('C:/test/file.txt', path)
})
...
And I also tried the sync version:
walkdir.sync(dir, options, callback)

function callback(path) {
  let res = wstream.write(`"${path}",`)
  if (!res) {
    wstream.once('drain', callback)
  } else {
    callback()
  }
}
But both of these approaches produce the same result: RAM usage climbs to 500-800MB.
I also tried the following approach. RAM usage stays at around ~100MB, but it doesn't actually work: it writes 412kb to the file and then just keeps using CPU while nothing else happens (the other approaches finish writing the file within 1-2 minutes).
const readdirp = require('readdirp');
const { Transform } = require('stream');

const entryInfoStream = readdirp({
  root: dir
});

entryInfoStream
  .pipe(new Transform({
    objectMode: true,
    transform(entryInfo, encoding, callback) {
      this.push(entryInfo.path);
      callback();
    },
  }))
  .pipe(wstream);
How do I make sure the streams work as intended (i.e. with low memory usage)?
How can I compress (gzip) the file while it is being written? Or can that only be done after the write has finished?
Answer 0 (score: 2)
You can implement all of this logic yourself, without any external dependencies, so you can see exactly where to optimize. Below is a minimal implementation you can tweak:
const fs = require('fs');
const path = require('path');
const zlib = require('zlib');

// Recursively walk the file system, writing every path into the stream `str`.
// `busy` counts in-flight readdir/stat calls so the caller knows when the walk is done.
function walk(dir, str, busy) {
  busy.inc();
  fs.readdir(dir, (e, c) => {
    if (!e) {
      c.forEach(f => {
        const p = path.join(dir, f);
        busy.inc();
        fs.stat(p, (e, s) => {
          if (!e && s.isDirectory()) {
            walk(p, str, busy);
          }
          str.write(p + "\n");
          busy.dec();
        });
      });
    }
    busy.dec();
  });
}

// Scan the file system and write the gzipped list of paths to `dest`.
async function scan(dir, dest) {
  return new Promise((resolve) => {
    const gzStr = zlib.createGzip();
    const destStr = fs.createWriteStream(dest);

    let count = 0;
    const busy = {
      inc: () => count++,
      dec: () => {
        count--;
        if (count < 1) {
          process.nextTick(() => {
            gzStr.end();
            gzStr.once('finish', resolve);
          });
        }
      }
    };

    walk(dir, gzStr, busy);
    gzStr.pipe(destStr);
  });
}

// Test the above code
(async () => {
  // Save gzipped
  await scan(__dirname, './files.txt.gz');

  // Gunzip to verify
  const unzipped = fs.createWriteStream('./files.txt');
  fs.createReadStream('./files.txt.gz').pipe(zlib.createGunzip()).pipe(unzipped);

  // End
  unzipped.on('close', () => console.log('done'));
})();
Answer 1 (score: 0)
This happens because your asynchronous operations have no limit on them. Every path emits a new paths.on('path', ...) event, so paths get loaded onto the event loop much faster than they can be processed, and that is what causes the memory spike. You need to limit the number of paths being written at a time.
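For reference, the usual way to respect backpressure on a writable stream is to check the boolean returned by write() and, when it is false, wait for the 'drain' event before writing more. A minimal sketch of that pattern (the writeWithBackpressure helper is just an illustration, not part of walkdir):

// Write one chunk, then call `next` either immediately (buffer still has room)
// or once the stream emits 'drain' (the internal buffer was full).
function writeWithBackpressure(stream, chunk, next) {
  if (stream.write(chunk)) {
    process.nextTick(next);
  } else {
    stream.once('drain', next);
  }
}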
You could throttle it by using walkdir.sync, but then you can only process one path at a time. Also, depending on how you implement it, you may still end up discovering paths faster than you can write them to your stream.
A more flexible solution is to keep track of how many paths are being processed concurrently and pause the stream once you hit a limit.
const fs = require('fs')
const walkdir = require('walkdir')

let options = {
  "max_depth": 0,
  "track_inodes": true
}

let dir = "C:/"
let paths = walkdir(dir, options)
var wstream = fs.createWriteStream('C:/test/file.txt')
wstream.write('[')

const maxPaths = 20;    // Maximum amount of concurrent paths allowed to process
let currentPaths = 0;   // Current amount of concurrent paths being processed
let deferredPaths = []; // If we somehow exceed the limit, store the excess paths here
                        // for later processing. This might not be necessary, depending
                        // on how walkdir implements their pause function

const finishPathFlush = () => {
  if (deferredPaths.length > 0) {
    // Process any paths in the deferred queue
    wstream.write('"' + deferredPaths.pop() + '",', finishPathFlush);
  } else {
    // No more work to do, resume walkdir
    --currentPaths;
    paths.resume();
  }
}

paths.on('path', function(path, stat) {
  if (currentPaths < maxPaths) {
    // We have room to process this path
    if (++currentPaths === maxPaths) {
      // If we reach the limit pause walkdir
      paths.pause();
    }
    wstream.write(`"${path}",`, finishPathFlush)
  } else {
    // Got too many paths, defer this path
    deferredPaths.push(path);
  }
})

paths.on('end', function(path, stat) {
  wstream.write(']')
  wstream.end()

  // Compressing the file after it's written:
  const gzip = require('zlib').createGzip()
  const inp = fs.createReadStream('C:/test/file.txt')
  const out = fs.createWriteStream('C:/test/file.txt.gz')
  inp.pipe(gzip).pipe(out)
})
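As for the second part of the question (compressing while writing instead of afterwards): rather than writing file.txt and then re-reading it for compression, you can write through a gzip stream from the start, similar to what the other answer does. A minimal sketch of that wiring, reusing the file names from the question:

const fs = require('fs')
const zlib = require('zlib')

// Pipe a gzip Transform into the destination file and write to the gzip
// stream instead of a plain write stream; data is compressed as it arrives.
const gzip = zlib.createGzip()
const out = fs.createWriteStream('C:/test/file.txt.gz')
gzip.pipe(out)

gzip.write('[')
// ... write each `"${path}",` chunk to gzip instead of wstream ...
gzip.end(']')
out.on('close', () => console.log('compressed file written'))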