I need to transfer about 300,000 files (roughly 1.5 TB) into a Google Cloud Storage bucket.
The challenge:
Here is the code I intend to use:
// Imports the Google Cloud client library.
const { Storage } = require('@google-cloud/storage'); // v2+ of the library exports a named Storage class
const db = require('../models');
// Instantiates a client. If you don't specify credentials when constructing
// the client, the client library will look for credentials in the
// environment.
const storage = new Storage();
const bucketName = "bucket-name";
const increment = 5;
let globalCounter = 0;
function getPDFLinks(){
return new Promise((resolve, reject) => {
const dbRecords = [/*Array of URLs from our db*/];
const dlLinkArray = dbRecords.map(link => ({
link: 'https://sample.domain.com' + link.dataValues.downloadLink,
filename: link.dataValues.contentID
}));
console.log("dlLinkArray Length: ", dlLinkArray.length);
downloadPDFsSlow(dlLinkArray, 0)
.then(x => {
console.log("finished all downloads and updated SQL");
resolve(x);
})
.catch(e => {
console.error(e);
reject(e);
});
});
}
function downloadPDFsSlow(linksArray, counter){ //processes <increment> uploads at a time; each element is {link: url, filename: contentID}
return new Promise((resolve, reject) => {
Promise.all(linksArray.slice(counter, counter+increment).map(x => uploadFile(bucketName, x.link, x.filename) ))
.then(() => {
//console.log("Map uploadFile results: ", x);
globalCounter++;
console.log('globalCounter: ', globalCounter);
if(linksArray.length > counter){ //have not reached the end of the URLs
const toUpdate = linksArray.slice(counter, counter+increment).map(x => x.filename);
updateRecords(toUpdate); //fire-and-forget (ASYNC)
//Chain the recursive call into this promise so the outer caller only
//resolves once the final batch has finished (the original call dropped it).
setTimeout(() => downloadPDFsSlow(linksArray, counter+increment).then(resolve).catch(reject), 1000);
}
else{ //Reached the end
console.log("DONE");
resolve(`downloadPDFsSlow completed ${linksArray.length} records`);
}
})
.catch(e => {
console.error(e);
//log the error, then continue with the next slice so one bad batch does not stall the whole run
if(linksArray.length > counter){
console.log("Skipping to next slice. Counter: ", counter);
setTimeout(() => downloadPDFsSlow(linksArray, counter+increment).then(resolve).catch(reject), 1000);
}else{
reject('downloadPDFsSlow FAILED at the end: ' + JSON.stringify(e));
}
});
});
}
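// For reference, a sketch of the same "<increment> at a time" batching written
// with async/await — an assumed equivalent rewrite, not part of the original code:
async function downloadPDFsBatched(linksArray) {
  for (let counter = 0; counter < linksArray.length; counter += increment) {
    const batch = linksArray.slice(counter, counter + increment);
    try {
      await Promise.all(batch.map(x => uploadFile(bucketName, x.link, x.filename)));
      updateRecords(batch.map(x => x.filename)); // fire-and-forget, as above
    } catch (e) {
      console.error('Batch failed, skipping. Counter:', counter, e);
    }
    await new Promise(r => setTimeout(r, 1000)); // throttle between batches
  }
  return `completed ${linksArray.length} records`;
}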
function uploadFile(bucketName, fileURL, reName) { //uploads fileURL, deletes the object if too small (PDF not available), else renames it to pdf/contentID.pdf
return new Promise( (resolve, reject) => {
// NOTE: bucket.upload() expects a *local* file path. fileURL here is an
// https:// URL, so the remote file needs to be fetched first — see the
// streaming sketch after this function.
storage
.bucket(bucketName)
.upload(fileURL, {
// Support for HTTP requests made with `Accept-Encoding: gzip`
gzip: true,
metadata: {
// Enable long-lived HTTP caching headers
// Use only if the contents of the file will never change
// (If the contents will change, use cacheControl: 'no-cache')
cacheControl: 'public, max-age=31536000',
},
})
.then((x) => {
console.log("SIZE: ", x[1].size);
if(x[1].size <= 202 || (x[1].size <= 13236 && x[1].size >= 13234)){ //these sizes match an error/placeholder page rather than a real PDF
deleteFile(bucketName, x[1].name)
.then(d => resolve(d))
.catch(e => reject(e));
}else{
//console.log(`${fileURL} uploaded to ${bucketName}.`);
renameFile(bucketName, x[1].name, "pdf/" + reName + ".pdf")
.then( renameResult => {
//console.log(renameResult);
resolve(x);
})
.catch(e => reject(e));
}
})
.catch(err => {
console.error('ERROR:', err);
reject(err);
});
});
}
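// A minimal sketch of fetching a remote URL and streaming it straight into the
// bucket, since bucket.upload() only handles local paths. The function name and
// error handling here are illustrative assumptions, not the original code:
const https = require('https');
function streamURLToBucket(bucketName, fileURL, destName) {
  return new Promise((resolve, reject) => {
    const dest = storage
      .bucket(bucketName)
      .file(destName)
      .createWriteStream({
        gzip: true,
        metadata: { cacheControl: 'public, max-age=31536000' },
      });
    https.get(fileURL, res => {
      if (res.statusCode !== 200) {
        return reject(new Error(`HTTP ${res.statusCode} for ${fileURL}`));
      }
      res.pipe(dest).on('finish', resolve).on('error', reject);
    }).on('error', reject);
  });
}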
function listFiles(bucketName) {
// Lists the files in the bucket; returns the promise so callers can await it
return storage
.bucket(bucketName)
.getFiles()
.then(results => {
const files = results[0];
console.log('Files:');
files.forEach(file => {
console.log(file.name);
});
})
.catch(err => {
console.error('ERROR:', err);
});
}
function renameFile(bucketName, srcFilename, destFilename) {
return new Promise( (resolve, reject) => {
// Moves (renames) the file within the bucket
storage
.bucket(bucketName)
.file(srcFilename)
.move(destFilename)
.then((x) => {
console.log(
`gs://${bucketName}/${srcFilename} moved to gs://${bucketName}/${destFilename}.`
);
resolve(x);
})
.catch(err => {
console.error('ERROR:', err);
reject(err);
});
});
}
function deleteFile(bucketName, filename) {
return new Promise( (resolve, reject) => {
// Deletes the file from the bucket
storage
.bucket(bucketName)
.file(filename)
.delete()
.then((x) => {
console.log(`gs://${bucketName}/${filename} deleted.`);
resolve(x);
})
.catch(err => {
console.error('ERROR:', err);
reject(err);
});
});
}
function updateRecords(recordsToUpdate){
//fire-and-forget by design (see downloadPDFsSlow); return the promise so callers could await it
return db.sequelize.sync({force: false}).then(function(){
return Promise.all(recordsToUpdate.map(x => db.Record.update({localFile: x + '.pdf'}, {where: { contentID: x }})))
.then(() => {
console.log("Updated filename");
//db.sequelize.close();
})
.catch(e => console.error(e));
});
}
//EXECUTE
getPDFLinks()
.then(x => {
console.log("getPDFLinks COMPLETE");
console.log(x);
})
.catch(e => {
console.error("getPDFLinks FAILED");
console.error(e);
});
Answer 0 (score: 0)
I suggest you look into task queues (Cloud Tasks).
A good approach is to create a Task for each of your dbRecords (or perhaps for batches of them). A worker process then pulls each file and applies your transformation before saving the result to GCS. This approach gives you parallelism, shorter requests, asynchronous processing, and retries. A rough sketch of enqueuing such batches follows.
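A minimal sketch of that approach using the @google-cloud/tasks client — the project ID, location, queue name, and worker URL below are placeholders, and the worker endpoint itself is assumed to exist:

const { CloudTasksClient } = require('@google-cloud/tasks');
const tasksClient = new CloudTasksClient();

async function enqueueBatches(dlLinkArray, batchSize = 5) {
  // Fully qualified queue name: projects/<project>/locations/<location>/queues/<queue>
  const parent = tasksClient.queuePath('my-project', 'us-central1', 'pdf-upload-queue');
  for (let i = 0; i < dlLinkArray.length; i += batchSize) {
    const batch = dlLinkArray.slice(i, i + batchSize);
    await tasksClient.createTask({
      parent,
      task: {
        httpRequest: {
          httpMethod: 'POST',
          url: 'https://my-worker.example.com/process-batch', // assumed worker endpoint
          headers: { 'Content-Type': 'application/json' },
          body: Buffer.from(JSON.stringify(batch)).toString('base64'),
        },
      },
    });
  }
}

Cloud Tasks then applies the queue's retry and rate-limiting policy for you, so each worker request only has to process one small batch.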