节点fs.readdir在文件太多的文件夹中冻结

时间:2018-11-02 20:44:43

标签: javascript node.js fs

在Node.js中,我必须读取文件夹中的文件,并且对于每个文件都获得文件处理程序信息,这是我最简单的使用fs.readdir的实现:

FileServer.prototype.listLocal = function (params) {
            var self = this;
            var options = {
                limit: 100,
                desc: 1
            };
            // override defaults
            for (var attrname in params) { options[attrname] = params[attrname]; }

            // media path is the media folder
            var mediaDir = path.join(self._options.mediaDir, path.sep);
            return new Promise((resolve, reject) => {
                fs.readdir(mediaDir, (error, results) => {
                    if (error) {
                        self.logger.error("FileServer.list error:%s", error);
                        return reject(error);
                    } else { // list files
                        // cut to max files
                        results = results.slice(0, options.limit);
                        // filter default ext
                        results = results.filter(item => {
                            return (item.indexOf('.mp3') > -1);
                        });
                        // format meta data
                        results = results.map(file => {
                            var filePath = path.join(self._options.mediaDir, path.sep, file);
                            var item = {
                                name: file,
                                path: filePath
                            };
                            const fd = fs.openSync(filePath, 'r');
                            var fstat = fs.fstatSync(fd);
                            // file size in bytes
                            item.size = fstat.size;
                            item.sizehr = self.formatSizeUnits(fstat.size);
                            // "Birth Time" Time of file creation. Set once when the file is created. 
                            item.birthtime = fstat.birthtime;
                            // "Modified Time" Time when file data last modified.
                            item.mtime = fstat.mtime;
                            // "Access Time" Time when file data last accessed.
                            item.atime = fstat.atime;
                            item.timestamp = new Date(item.mtime).getTime();
                            item.media_id = path.basename(filePath, '.mp3');

                            fs.closeSync(fd);//close file
                            return item;
                        });
                        if (options.desc) { // sort by most recent
                            results.sort(function (a, b) {
                                return b.timestamp - a.timestamp;
                            });
                        } else { // sort by older
                            results.sort(function (a, b) {
                                return a.timestamp - b.timestamp;
                            });
                        }
                        return resolve(results);
                    }
                })
            });
        }

这样,每个文件我都会得到一个项目数组

{
  "name": "sample121.mp3",
  "path": "/data/sample121.mp3",
  "size": 5751405,
  "sizehr": "5.4850 MB",
  "birthtime": "2018-10-08T15:26:08.397Z",
  "mtime": "2018-10-08T15:26:11.650Z",
  "atime": "2018-10-10T09:01:48.534Z",
  "timestamp": 1539012371650,
  "media_id": "sample121"
}

也就是说,问题在于,当要列出的文件夹包含大量文件(例如,从一万到十万甚至更多)时,node.js fs.readdir可能会冻结Node I / O Loop。 这是一个已知问题-有关更多信息,请参见here。 还计划通过某种方式来改进fs.readdir,例如流媒体-有关此问题,请参见here

与此同时,我正在搜索此补丁,因为我的文件夹很大。 由于问题是事件循环被冻结,因此有人使用process.nextTick提出了一个解决方案,该解决方案我已在此处集成

FileServer.prototype.listLocalNextTick = function (params) {
            var self = this;
            var options = {
                limit: 100,
                desc: 1
            };
            // override defaults
            for (var attrname in params) { options[attrname] = params[attrname]; }

            // media path is the media folder
            var mediaDir = path.join(self._options.mediaDir, path.sep);
            return new Promise((resolve, reject) => {
                var AsyncArrayProcessor = function (inArray, inEntryProcessingFunction) {
                    var elemNum = 0;
                    var arrLen = inArray.length;
                    var ArrayIterator = function () {
                        inEntryProcessingFunction(inArray[elemNum]);
                        elemNum++;
                        if (elemNum < arrLen) process.nextTick(ArrayIterator);
                    }
                    if (elemNum < arrLen) process.nextTick(ArrayIterator);
                }
                fs.readdir(mediaDir, function (error, results) {
                    if (error) {
                        self.logger.error("FileServer.list error:%s", error);
                        return reject(error);
                    }
                    // cut to max files
                    results = results.slice(0, options.limit);
                    // filter default ext
                    results = results.filter(item => {
                        return (item.indexOf('.mp3') > -1);
                    });
                    var ProcessDirectoryEntry = function (file) {
                        // This may be as complex as you may fit in a single event loop
                        var filePath = path.join(self._options.mediaDir, path.sep, file);
                        var item = {
                            name: file,
                            path: filePath
                        };
                        const fd = fs.openSync(filePath, 'r');
                        var fstat = fs.fstatSync(fd);
                        // file size in bytes
                        item.size = fstat.size;
                        item.sizehr = self.formatSizeUnits(fstat.size);
                        // "Birth Time" Time of file creation. Set once when the file is created. 
                        item.birthtime = fstat.birthtime;
                        // "Modified Time" Time when file data last modified.
                        item.mtime = fstat.mtime;
                        // "Access Time" Time when file data last accessed.
                        item.atime = fstat.atime;
                        item.timestamp = new Date(item.mtime).getTime();
                        item.media_id = path.basename(filePath, '.mp3');
                        // map to file item
                        file = item;
                    }//ProcessDirectoryEntry
                    // LP: fs.readdir() callback is finished, event loop continues...
                    AsyncArrayProcessor(results, ProcessDirectoryEntry);
                    if (options.desc) { // sort by most recent
                        results.sort(function (a, b) {
                            return b.timestamp - a.timestamp;
                        });
                    } else { // sort by older
                        results.sort(function (a, b) {
                            return a.timestamp - b.timestamp;
                        });
                    }
                    return resolve(results);
                });
            });
        }//listLocalNextTick

这似乎避免了最初的问题,但是我再也无法通过以前使用文件处理程序将文件列表映射到项目,因为在文件列表上运行AsyncArrayProcessor时,因此ProcessDirectoryEntry在每个文件条目上,process.nextTick的异步特性导致我无法找回在先前的results函数中修改过的listLocal数组,在该函数中,我只是执行了迭代的array.map results数组。 如何修补listLocalNextTick使其表现得像listLocal却又保持process.nextTick的行为?

[更新]

根据提出的解决方案,这是迄今为止最好的实现方式:

       /**
         * Scan files in directory
         * @param {String} needle 
         * @param {object} options 
         * @returns {nodeStream}
         */
        scanDirStream : function(needle,params) {
            var options = {
                type: 'f',
                name: '*'
            };
            for (var attrname in params) { options[attrname] = params[attrname]; }
            return new Promise((resolve, reject) => {
                var opt=[needle];
                for (var k in options) {
                    var v = options[k];
                    if (!Util.empty(v)) {
                        opt.push('-' + k);
                        opt.push(v);
                    }
                };
                var data='';
                var listing = spawn('find',opt)
                listing.stdout.on('data', _data => {
                    var buff=Buffer.from(_data, 'utf-8').toString();
                    if(buff!='') data+=buff;
                })
                listing.stderr.on('data', error => {
                    return reject(Buffer.from(error, 'utf-8').toString());
                });
                listing.on('close', (code) => {
                    var res = data.split('\n');
                    return resolve(res);
                });
            });

用法示例:

scanDirStream(mediaRoot,{
        name: '*.mp3'
    })
    .then(results => {
        console.info("files:%d", results);
    })
    .catch(error => {
        console.error("error %s", error);
    });

最终可以对其进行修改,以在在目录中获取新文件时在发出的每个stdout.on事件中添加一个滴答回调。

1 个答案:

答案 0 :(得分:1)

我已经为find创建了一个包装器,但是您可以以相同的方式使用dir或ls。

const { spawn } = require('child_process');

/**
 * findNodeStream
 * @param {String} dir 
 * @returns {nodeStream}
 */
const findNodeStream = (dir,options) => spawn('find',[dir,options].flat().filter(x=>x));

/**
 * Usage Example:
  let listing = findNodeStream('dir',[options])
  listing.stdout.on('data', d=>console.log(d.toString()))
  listing.stderr.on('data', d=>console.log(d.toString()))
  listing.on('close', (code) => {
    console.log(`child process exited with code ${code}`);
  });
*/

这允许您流式传输分块目录,而不是像fs.readdir那样整体传输。