Question

我使用的库：

const Promise = require('bluebird');
const fs = Promise.promisifyAll(require('graceful-fs'));
const path = require('path');
const xml2js = Promise.promisifyAll(require('xml2js'));

我有大量要解析的XML文件。我可以使用此函数创建一个指向所有文件的路径数组：

/**
 * Collects the paths of all `.XML` files that sit one level below `rootPath`,
 * i.e. directly inside each immediate sub-directory of `rootPath`.
 *
 * @param {string} rootPath - Directory whose sub-directories are scanned.
 * @returns {Promise<string[]>} Promise for the flat list of file paths.
 *   Resolves to an empty array when no sub-directory or no XML file exists.
 */
function getFileNames(rootPath) {
  // Read content of path
  return fs.readdirAsync(rootPath)
    // Keep only the directories. stat is done asynchronously so the event
    // loop is not blocked while the entries are inspected.
    .filter(function(entry) {
      return fs.statAsync(path.join(rootPath, entry))
        .then(function(stats) {
          return stats.isDirectory();
        });
    })
    // For every directory
    .map(function(directory) {
      const currentPath = path.join(rootPath, directory);
      // Read files in the directory
      return fs.readdirAsync(currentPath)
        // Keep only the XMLs (the extension match is case-sensitive)
        .filter(function(file) {
          return path.extname(file) === '.XML';
        })
        // Return path to file
        .map(function(file) {
          return path.join(currentPath, file);
        });
    })
    // Flatten array of results. The [] seed makes an empty directory list
    // resolve to [] instead of undefined.
    .reduce(function(a, b) {
      return a.concat(b);
    }, []);
}

现在我想遍历每个文件并对其进行解析。

我有两个函数：

/**
 * Reads the file at `filePath`.
 *
 * The path is passed through unchanged: relative paths are still resolved
 * against the current working directory, and absolute paths now work too
 * (prefixing './' used to turn them into relative ones).
 *
 * @param {string} filePath - Relative or absolute path of the file to read.
 * @returns {Promise<Buffer>} Promise for the file contents; rejects when the
 *   file cannot be read.
 */
function openFile(filePath) {
  return fs.readFileAsync(filePath);
}

/**
 * Parses XML content into a JavaScript object via xml2js.
 *
 * @param {string|Buffer} data - XML content to parse.
 * @returns {Promise<Object>} Promise for the parsed object; rejects when
 *   xml2js reports a parse error.
 */
function parseFile(data) {
  const parsing = xml2js.parseStringAsync(data);
  return parsing;
}

现在，当我使用 .map 调用这两个函数时（getFileNames 函数会输出一个包含 2 万多个文件路径字符串的数组）：

// Open, parse and print every XML file found under ./XML.
// The per-file promise chain is not returned from the callback, so .map()
// does not wait for it before moving on to the next path.
getFileNames('./XML').map((file) => {
  openFile(file)
    .then((data) => parseFile(data))
    .then((object) => {
      console.log(object);
    });
});

我得到一个 JavaScript 堆内存不足的错误：

FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory（致命错误：CALL_AND_RETRY_LAST 分配失败——JavaScript 堆内存不足）

但是，当我只传入某一个实际文件的路径、单独运行一次这些函数时：

// Open one known file, parse it and print the resulting object.
openFile('./XML/2016-10-1/EUROTIPOLD2016-10-1T00-00-22.5756240530.XML')
  .then((data) => parseFile(data))
  .then((object) => {
    console.log(object);
  });

我得到了所需的输出。

我做错了什么？

Answer 1

遍历 n 千（nK）个文件的过程是异步的。

1）您正在获取文件列表

2）在 .map 中您调用了 openFile 和 parseFile，它们都是异步函数，读取和解析都需要时间。

因此，由于是异步执行，它不会等待前一个文件处理完成（垃圾回收器也就没有机会回收内存）就开始处理下一个文件，于是出现了内存不足的问题。

想象一下同时读取 2 万个大小不一的文件。

所以这是解决方案：

使用 async 库按顺序（eachSeries）或限制并发数（eachLimit）进行迭代。

const async = require('async'); // install: npm i --save async

// getFileNames() returns a promise, so wait for the list before iterating.
getFileNames('./XML')
  .then((files) => {
    // eachSeries handles one file at a time; use
    // async.eachLimit(files, 3, ...) to process up to three files at once.
    async.eachSeries(
      files,
      (file, next) => {
        openFile(file)
          .then(
            // The parse handlers are nested inside the open-success branch so
            // an open failure is reported once and never reaches the parse step.
            (data) => parseFile(data).then(
              (object) => { // successfully parsed file, so log it out
                console.log(object);
              },
              (err) => {
                console.error('Cannot parse data from file:', file, err);
              }
            ),
            (err) => {
              console.error('Cannot open file:', file, err);
            }
          )
          // Single exit point: next() is called exactly once per file.
          .then(() => next(), next);
      },
      (err) => {
        // Only reached with an error when a handler above threw unexpectedly.
        if (err) {
          console.error('Processing stopped:', err);
        }
      }
    );
  })
  .catch((err) => {
    console.error('Cannot list files:', err);
  });

P.S. 欢迎随时评论并指出、修复代码中的问题。

Answer 2

这只是一个工作负载需要更多资源的简单情况。我会考虑增加堆大小以满足您的需求，而不是更改源代码。

我建议相应地设置--max_old_space_size以满足要求 - 但这可能是一个迭代过程。

希望这有帮助。

Javascript - .map耗尽内存

2 个答案: