解析大型xml.gz流时出现Node.js内存不足错误

时间:2019-05-25 23:34:13

标签: node.js performance stream out-of-memory fs

我具有以下xml.gz解析逻辑,该逻辑遍历〜2000个xml.gz文件(作为流),其中包含大约25,000个节点或“化合物”,我一次将它们批量加载到mongodb 1000中。

const zlib = require('zlib')
const fs = require('fs')
const sax = require('sax')
const MongoClient = require('mongodb').MongoClient

// Connection URL
const url = 'mongodb://localhost:27017'

// Database Name
const dbName = 'foo'

const paths = fs.readdirSync('./')

MongoClient.connect(url, function(err, client) {
  console.log("Connected successfully to server")

  const db = client.db(dbName)
  const compounds = db.collection('compounds')
  compounds.createIndex(
    { "visited": 1 },
    null,
    function(err, results) {
      console.log(results)
      perform(compounds, paths)
    }
  )
})

function perform(compounds, paths) {
  var i = 0
  var log = fs.createWriteStream('log.csv')
  log.write('path\n')
  next()

  function next() {
    var name = paths[i++]
    if (!name) return console.log('done')

    doit(log, compounds, name, function(){
      setImmediate(next)
    })
  }
}

function doit(log, collection, name, done) {
  console.log('parsing...', name)
  log.write(name + '\n')

  const gzip = zlib.createGunzip()
  const xmlStream = fs.createReadStream(name)
  const out = fs.createWriteStream('foo.xml')

  const saxStream = sax.createStream(false, {})

  saxStream.on('error', function(e){
    console.error('error!', e)
    this._parser.error = null
    this._parser.resume()
  })

  var compounds = []
  var compound
  var bondIndex = 0
  var atomIndex = 0
  var field
  var label
  var name
  var point
  var type
  var op
  var insert = 0

  function write(end) {
    if (compounds.length === 1000 || end) {
      console.log('writing', insert++)
      var inserts = compounds
      compounds = []
      collection.bulkWrite(inserts.map(compound => {
        compound.visited = false
        return {
          updateOne: {
            filter: { ncbiId: compound.ncbiId },
            upsert: true,
            update: { $set: compound }
          }
        }
      }),
      {},
      function(){

      })
    }
  }

  saxStream.on('text', function(text){
    switch (field) {
      case 'value':
        var value = text

        if (label.match(/IUPAC Name/i)) {
          if (name.match(/Systematic/i)) {
            compound.iupacSystematicName = value
          } else if (name.match(/Preferred/i)) {
            compound.iupacPreferredName = value
          } else if (name.match(/Markup/i)) {
            compound.iupacMarkupName = value
          } else if (name.match(/CAS.like Style/i)) {
            compound.iupacCASName = value
          } else if (name.match(/Allowed/i)) {
            compound.iupacAllowedName = value
          }
        } else if (label.match(/Fingerprint/i)) {
          compound.fingerprint = value
        } else if (label.match(/Compound Complexity/i)) {
          compound.complexity = parseFloat(value)
        } else if (label.match(/InChIKey/)) {
          compound.inchiKey = value
        } else if (label.match(/InChI/)) {
          compound.inchi = value
        } else if (label.match(/Log P/)) {
          compound.logP = parseFloat(value)
        } else if (label.match(/Mass/)) {
          compound.mass = parseFloat(value)
        } else if (label.match(/Molecular Formula/)) {
          compound.formula = value
        } else if (label.match(/Molecular Weight/)) {
          compound.weight = value
        } else if (label.match(/SMILES/)) {
          if (name.match(/Canonical/)) {
            compound.smiles = value
          }
        } else {
          // console.log(label, name)
        }

        if (name) {
          if (name.match(/Hydrogen Bond Acceptor/)) {
            compound.hydrogenBondAcceptor = parseFloat(value)
          } else if (name.match(/Hydrogen Bond Donor/)) {
            compound.hydrogenBondDonor = parseFloat(value)
          } else if (name.match(/Rotatable Bond/)) {
            compound.rotatable = parseFloat(value)
          } else if (name.match(/Polar Surface Area/)) {
            compound.polarSurfaceArea = parseFloat(value)
          }
        }

        break
      case null:
      case undefined:
        break
      default:
        switch (type) {
          case 'int':
            point[field] = parseInt(text)
            break
          case 'float':
            point[field] = parseFloat(text)
            break
          default:
            point[field] = text
            break
        }
        break
    }

    if (op == 'getAtom') {
      point = compound.atoms[parseInt(text) - 1]
    } else if (op == 'setBondStart') {
      point.start = parseInt(text) - 1
    } else if (op == 'setBondEnd') {
      point.end = parseInt(text) - 1
    } else if (op == 'setBondOrder') {
      point.order = parseInt(text)
    } else if (op == 'getLabel') {
      label = text
    } else if (op == 'getName') {
      name = text
    }
  })

  saxStream.on('opentag', function(node){
    switch (node.name) {
      case 'PC-COMPOUNDTYPE_ID_CID':
        point = compound = {
          atoms: [],
          bonds: []
        }
        field = 'ncbiId'
        type = 'string'
        break
      case 'PC-ELEMENT':
        point = {}
        compound.atoms.push(point)
        field = 'element'
        type = 'int'
        break
      case 'PC-ATOMINT_AID':
        op = 'getAtom'
        break
      case 'PC-ATOMINT_VALUE':
        field = 'charge'
        type = 'int'
        break
      case 'PC-BONDS_AID1':
        bondIndex = 0
        break
      case 'PC-BONDS_AID1_E':
        point = {}
        op = 'setBondStart'
        compound.bonds.push(point)
        break
      case 'PC-BONDS_AID2':
        bondIndex = 0
        break
      case 'PC-BONDS_AID2_E':
        op = 'setBondEnd'
        point = compound.bonds[bondIndex++]
        break
      case 'PC-BONDS_ORDER':
        bondIndex = 0
        break
      case 'PC-BONDTYPE':
        op = 'setBondOrder'
        point = compound.bonds[bondIndex++]
        break
      case 'PC-CONFORMER_X':
        atomIndex = 0
        break
      case 'PC-CONFORMER_X_E':
        point = compound.atoms[atomIndex]
        field = 'x'
        type = 'float'
        break
      case 'PC-CONFORMER_Y':
        atomIndex = 0
        break
      case 'PC-CONFORMER_Y_E':
        point = compound.atoms[atomIndex]
        field = 'y'
        type = 'float'
        break
      case 'PC-COMPOUND_CHARGE':
        point = compound
        field = 'charge'
        type = 'float'
        break
      case 'PC-URN_LABEL':
        op = 'getLabel'
        break
      case 'PC-URN_NAME':
        op = 'getName'
        break
      case 'PC-INFODATA_VALUE_IVAL':
      case 'PC-INFODATA_VALUE_BINARY':
      case 'PC-INFODATA_VALUE_SVAL':
      case 'PC-INFODATA_VALUE_IVEC':
      case 'PC-INFODATA_VALUE_FVAL':
      case 'PC-INFODATA_VALUE_BVAL':
      case 'PC-INFODATA_VALUE_BVEC':
      case 'PC-INFODATA_VALUE_SLIST':
      case 'PC-INFODATA_VALUE_FVEC':
      case 'PC-INFODATA_VALUE_BITLIST':
      case 'PC-INFODATA_VALUE_DATE':
        field = 'value'
        break
    }

  })

  saxStream.on('end', function(){
    write(true)
    done()
  })

  saxStream.on('closetag', function(node){
    type = null
    field = null
    op = null

    switch (node) {
      case 'PC-COMPOUND':
        compounds.push(compound)
        write(false)
        compound = null
        point = null
        break
      case 'PC-INFODATA':
        name = null
        label = null
        value = null
        break
    }
  })

  var i = 0
  xmlStream.pipe(gzip).pipe(saxStream)
}

本质上可以归结为:

const zlib = require('zlib')
const fs = require('fs')
const sax = require('sax')
const MongoClient = require('mongodb').MongoClient

// Connection URL
const url = 'mongodb://localhost:27017'

// Database Name
const dbName = 'foo'

const paths = fs.readdirSync('.')

MongoClient.connect(url, function(err, client) {
  console.log("Connected successfully to server")

  const db = client.db(dbName)
  const compounds = db.collection('compounds')
  compounds.createIndex(
    { "visited": 1 },
    null,
    function(err, results) {
      console.log(results)
      perform(compounds, paths)
    }
  )
})

function perform(compounds, paths) {
  var i = 0
  var log = fs.createWriteStream('log.csv')
  log.write('path\n')
  next()

  function next() {
    var name = paths[i++]
    if (!name) return console.log('done')

    doit(log, compounds, name, function(){
      setImmediate(next)
    })
  }
}

function doit(log, collection, name, done) {
  console.log('parsing...', name)
  log.write(name + '\n')

  const gzip = zlib.createGunzip()
  const xmlStream = fs.createReadStream(name)
  const out = fs.createWriteStream('foo.xml')

  const saxStream = sax.createStream(false, {})

  saxStream.on('error', function(e){
    console.error('error!', e)
    this._parser.error = null
    this._parser.resume()
  })

  var compounds = []
  var compound
  var bondIndex = 0
  var atomIndex = 0
  var field
  var label
  var name
  var point
  var type
  var op
  var insert = 0

  function write(end) {
    // every 1000 I write
    if (compounds.length === 1000 || end) {
      var inserts = compounds
      compounds = []
      collection.bulkWrite(inserts.map(compound => {
        compound.visited = false
        return {
          updateOne: {
            filter: { ncbiId: compound.ncbiId },
            upsert: true,
            update: { $set: compound }
          }
        }
      }),
      {},
      function(){

      })
    }
  }

  saxStream.on('text', function(text){

  })

  saxStream.on('opentag', function(node){

  })

  saxStream.on('end', function(){
    write(true)
    done()
  })

  saxStream.on('closetag', function(node){
    compounds.push(compound)
    write(false)
  })

  var i = 0
  xmlStream.pipe(gzip).pipe(saxStream)
}

但是,我遇到了内存过载,显示v8内存错误:

FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory

Writing Node.js report to file: report.20190519.135156.12689.001.json
Node.js report completed
 1: 0x100063e23 node::Abort() [/usr/local/bin/node]
 2: 0x100064491 node::OnFatalError(char const*, char const*) [/usr/local/bin/node]
 3: 0x10017e24f v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
 4: 0x10017e1f0 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node]
 5: 0x100440458 v8::internal::Heap::UpdateSurvivalStatistics(int) [/usr/local/bin/node]
 6: 0x100441e75 v8::internal::Heap::CheckIneffectiveMarkCompact(unsigned long, double) [/usr/local/bin/node]
 7: 0x10043f7ed v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector, v8::GCCallbackFlags) [/usr/local/bin/node]
 8: 0x10043e5e3 v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace, v8::internal::GarbageCollectionReason, v8::GCCallbackFlags) [/usr/local/bin/node]
 9: 0x1004464b1 v8::internal::Heap::AllocateRawWithLightRetry(int, v8::internal::AllocationSpace, v8::internal::AllocationAlignment) [/usr/local/bin/node]
10: 0x100446500 v8::internal::Heap::AllocateRawWithRetryOrFail(int, v8::internal::AllocationSpace, v8::internal::AllocationAlignment) [/usr/local/bin/node]
11: 0x1004268c7 v8::internal::Factory::NewFillerObject(int, bool, v8::internal::AllocationSpace) [/usr/local/bin/node]
12: 0x100603e6e v8::internal::Runtime_AllocateInNewSpace(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/bin/node]
13: 0x34613a14fc7d 
Abort trap: 6

问题是在流传输这些xml文件并解析它们时我做错了什么。我可以采取其他措施以免遇到内存错误。也许这是因为使用后我没有关闭流吗?我不知道。

0 个答案:

没有答案