我有一个由 Cloud Storage 存储桶触发的 Cloud Function(云函数)。它读取文件,使用 N3 将每一行转换为 RDF 三元组,然后将生成的三元组写入 Cloud Datastore。
由于它会将整个文件下载到内存中,因此不适合处理大文件。如何修改这个函数,使其一次只处理一行?
const storage = require('@google-cloud/storage')();
const Datastore = require('@google-cloud/datastore');
const N3 = require('n3');
/**
 * Handles a Cloud Storage object event.
 *
 * - Deleted object (`resourceState === 'not_exists'`): logs and finishes.
 * - Newly created object (`metageneration === '1'`): downloads the file,
 *   parses each line with N3, and stores the first triple of every line as a
 *   `triple` entity (subject / predicate / object) in Datastore.
 * - Anything else is treated as a metadata update: logs and finishes.
 *
 * NOTE(review): the whole file is still read into memory by `download()`;
 * this handler is therefore only suitable for files that fit in memory.
 * NOTE(review): the runtime presumably needs this handler exported
 * (e.g. `exports.helloGCS = helloGCS`) — confirm elsewhere in the file.
 *
 * @param {{data: {name: string, resourceState: string, metageneration: string}}} event
 *   Event whose `data` describes the changed object.
 * @param {function(?Error, string=): void} callback
 *   Completion callback; invoked exactly once, with `(null, 'ok')` on success
 *   or with the error when downloading, parsing or saving fails.
 */
const helloGCS = (event, callback) => {
  const file = event.data;

  if (file.resourceState === 'not_exists') {
    console.log(`File ${file.name} deleted.`);
    callback(null, 'ok');
    return;
  }

  // metageneration attribute is updated on metadata changes.
  // on create value is 1
  if (file.metageneration !== '1') {
    console.log(`File ${file.name} metadata updated.`);
    callback(null, 'ok');
    return;
  }

  console.log(`File ${file.name} uploaded.`);

  // Datastore documents a cap on the number of entities accepted by a single
  // commit (500); larger saves are split into sequential batches.
  const maxEntitiesPerSave = 500;
  const parser = new N3.Parser();
  const bucket = storage.bucket('woburn-advisory-ttl');
  const remoteFile = bucket.file(file.name);
  const datastore = new Datastore({});

  remoteFile
    .download()
    .then((data) => {
      if (!data) {
        return undefined;
      }

      // convert buffer to string
      const lines = data.toString().split('\n');
      console.log(lines.length);

      // Lines that yield no triple (e.g. blank lines) are skipped.
      const entities = [];
      lines.forEach((line) => {
        const triple = parser.parse(line)[0];
        if (!triple) {
          return;
        }
        entities.push({
          key: datastore.key('triple'),
          data: [
            { name: 'subject', value: triple.subject },
            { name: 'predicate', value: triple.predicate },
            { name: 'object', value: triple.object },
          ],
        });
      });
      console.log(entities.length);

      // Chain the batches so each save starts only after the previous one
      // succeeded; nothing is sent when there are no entities.
      let pending = Promise.resolve();
      for (let start = 0; start < entities.length; start += maxEntitiesPerSave) {
        const batch = entities.slice(start, start + maxEntitiesPerSave);
        pending = pending
          .then(() => datastore.save(batch))
          .then((response) => {
            console.log(`Triples created successfully. but... ${response}`);
          });
      }

      return pending.then(() => {
        console.log(`${entities.length} triples created`);
      });
    })
    .then(
      // Signal completion only after every save has settled.
      () => callback(null, 'ok'),
      (err) => {
        console.error(`Failed to process file ${file.name}:`, err);
        callback(err);
      }
    );
};
答案 0 :(得分:1)
不要调用 download(),而应改用 createReadStream()。这样就可以遍历整个文件,而无需将其全部存入内存。您可以使用 byline 或 readline 之类的库从该流中逐行读取。
大致的写法如下:
// Sketch: read the remote file as a stream and handle it one line at a time
// instead of downloading it into memory. Relies on `remoteFile`, `byline` and
// `parser` being defined by the surrounding code.
const gcsStream = remoteFile.createReadStream();
const lineStream = byline.createStream(gcsStream);
lineStream.on('data', (line) => {
  // The stream may emit Buffers; the parser is given text.
  const triple = parser.parse(line.toString())[0];
  //...
});