从云存储中读取大文件并写入数据存储区

时间:2018-02-24 16:41:19

标签: node.js google-cloud-datastore google-cloud-storage google-cloud-functions

我有一个云函数(Cloud Function),由云存储(Cloud Storage)中的存储桶触发。它读取文件,使用 N3 将每一行转换为 RDF 三元组,然后将生成的三元组写入数据存储区(Datastore)。

由于它会把整个文件下载到内存中,因此不适合处理大文件。如何修改这个函数,使其一次只处理一行?

const storage = require('@google-cloud/storage')();
const Datastore = require('@google-cloud/datastore');
const N3 = require('n3');

/**
 * Background function triggered by a change to a Cloud Storage object.
 *
 * For a newly created object (metageneration '1') the file is downloaded,
 * every line is parsed with N3, and each resulting triple is saved to
 * Datastore as an entity of kind 'triple' with `subject`, `predicate` and
 * `object` properties. Deletions and metadata-only updates are just logged.
 *
 * NOTE: the whole file is still downloaded into memory before it is split
 * into lines, so memory use grows with the size of the file.
 *
 * @param {Object} event - Trigger event; `event.data` describes the object
 *   (`name`, `resourceState`, `metageneration` are read here).
 * @param {Function} callback - Node-style completion callback. Invoked with
 *   `(null, 'ok')` once all work (including the Datastore writes) has
 *   finished, or with the error if downloading, parsing or saving failed.
 */
const helloGCS = (event, callback) => {
    const file = event.data;

    if (file.resourceState === 'not_exists') {
        console.log(`File ${file.name} deleted.`);
        callback(null, 'ok');
        return;
    }

    // metageneration is bumped on every metadata change; it is '1' only on create.
    if (file.metageneration !== '1') {
        console.log(`File ${file.name} metadata updated.`);
        callback(null, 'ok');
        return;
    }

    console.log(`File ${file.name} uploaded.`);

    // Datastore rejects commits above a fixed entity count (documented as 500),
    // so large files have to be written in several save() calls.
    const MAX_ENTITIES_PER_SAVE = 500;

    const parser = N3.Parser();
    const bucket = storage.bucket('woburn-advisory-ttl');
    const remoteFile = bucket.file(file.name);
    const datastore = new Datastore({});

    // Builds the Datastore entity for one parsed triple.
    const toEntity = (triple) => ({
        key: datastore.key('triple'),
        data: [
            { name: 'subject', value: triple.subject },
            { name: 'predicate', value: triple.predicate },
            { name: 'object', value: triple.object },
        ],
    });

    remoteFile.download()
        .then((data) => {
            // download() resolves with an array whose first element is the
            // file contents; tolerate a bare Buffer as well.
            const contents = Array.isArray(data) ? data[0] : data;
            if (!contents) {
                return undefined;
            }

            const lines = contents.toString().split('\n');
            console.log(lines.length);

            // Lines that do not yield a triple (e.g. blank lines) are skipped.
            const entities = [];
            for (const line of lines) {
                const triple = parser.parse(line)[0];
                if (triple) {
                    entities.push(toEntity(triple));
                }
            }
            console.log(entities.length);

            // Save the batches one after another so a failure stops the chain.
            let pending = Promise.resolve();
            for (let start = 0; start < entities.length; start += MAX_ENTITIES_PER_SAVE) {
                const batch = entities.slice(start, start + MAX_ENTITIES_PER_SAVE);
                pending = pending
                    .then(() => datastore.save(batch))
                    .then((response) => {
                        console.log(`Triples created successfully. but... ${response}`);
                    });
            }

            return pending.then(() => {
                console.log(`${entities.length} triples created`);
            });
        })
        .then(
            // Signal completion only after every write has settled; the
            // two-argument form keeps a throwing callback from being re-invoked.
            () => callback(null, 'ok'),
            (err) => {
                console.error(`Failed to process file ${file.name}:`, err);
                callback(err);
            }
        );
};

1 个答案:

答案 0 :(得分:1)

不要调用 download(),而应使用 createReadStream()。这样您就可以遍历整个文件,而无需把它全部存入内存。您可以使用 byline 或 readline 之类的库从该流中逐行读取。

总体上,代码大致如下:

// Stream the object instead of downloading it, and handle it one line at a
// time so the whole file never has to sit in memory.
const gcsStream = remoteFile.createReadStream();
const lineStream = byline.createStream(gcsStream);
lineStream.on('data', function(line) {
   // toString() ensures the parser receives a string even when the line
   // stream emits Buffers.
   let triple = parser.parse(line.toString())[0];
   //...
});