使用Node.js将大量文件上传到Google存储桶

时间:2018-09-27 23:52:00

标签: node.js google-app-engine google-cloud-storage

我需要将约300,000个文件(约1.5 TB)传输到Google Cloud存储桶。

挑战:

  • 文件来自远程来源,那么在上传过程中能否使用代理?
  • 整个过程可能需要几天时间。在哪里运行这么长时间的任务最合适?App Engine 服务、Compute Engine 实例,还是其他方案?

这是我要使用的代码:

// Imports the Google Cloud client library.
const Storage = require('@google-cloud/storage');
const db = require('../models');

// Instantiates a client. If you don't specify credentials when constructing
// the client, the client library will look for credentials in the
// environment.
const storage = new Storage();

// Destination bucket for all uploads.
const bucketName = "bucket-name";
// Batch size: number of uploads run in parallel per iteration of downloadPDFsSlow.
const increment = 5;

// Counts completed batches (not individual files) — incremented once per
// successful Promise.all in downloadPDFsSlow, for progress logging only.
let globalCounter = 0;

/**
 * Builds the list of {link, filename} records from the database and kicks off
 * the throttled batch upload.
 *
 * Fixed: the original wrapped an existing promise chain in `new Promise(...)`
 * (explicit-construction anti-pattern); the chain is now returned directly.
 *
 * @returns {Promise<string>} resolves with the completion message from
 *   downloadPDFsSlow; rejects with the first unrecoverable error.
 */
function getPDFLinks(){
    const dbRecords = [/*Array of URLs from our db*/];
    const dlLinkArray = dbRecords.map(link => ({
        link: 'https://sample.domain.com' + link.dataValues.downloadLink,
        filename: link.dataValues.contentID
    }));
    console.log("dlLinkArray Length: ", dlLinkArray.length);
    return downloadPDFsSlow(dlLinkArray, 0)
        .then(x => {
            console.log("finished all downloads and updated SQL");
            return x;
        })
        .catch(e => {
            console.error(e);
            throw e;
        });
}

/**
 * Uploads <increment> files at a time, pausing one second between batches.
 * Each element of linksArray is {link: url, filename: contentID}.
 *
 * Fixed: the original invoked the recursive call inside setTimeout and
 * DISCARDED its promise, so the promise handed to the first caller could
 * never settle once recursion started (getPDFLinks' "finished" branch was
 * unreachable for more than one batch). The recursion is now chained
 * through a delay promise so the outermost promise resolves when the last
 * batch completes. Also rejects with an Error instead of a bare string.
 *
 * @param {{link: string, filename: string}[]} linksArray
 * @param {number} counter - index of the first record in the current batch.
 * @returns {Promise<string>} completion message once all batches are done.
 */
function downloadPDFsSlow(linksArray, counter){
    const batch = linksArray.slice(counter, counter + increment);
    // Promise-based sleep so the 1s throttle can be chained, not fire-and-forget.
    const delay = ms => new Promise(res => setTimeout(res, ms));

    return Promise.all(batch.map(x => uploadFile(bucketName, x.link, x.filename)))
        .then(() => {
            globalCounter++; // batch counter, for progress logging
            console.log('globalCounter: ', globalCounter);
            if (linksArray.length > counter) { // have not reached the end of URLs
                // Fire-and-forget by design (see original "ASYNC" note):
                // SQL bookkeeping is not awaited so uploads aren't serialized on it.
                updateRecords(batch.map(x => x.filename));
                return delay(1000).then(() => downloadPDFsSlow(linksArray, counter + increment));
            }
            // Reached the end.
            console.log("DONE");
            return `downloadPDFsSlow completed ${linksArray.length} records`;
        })
        .catch(e => {
            console.error(e);
            // Log the failure and move on to the next slice rather than aborting.
            if (linksArray.length > counter) {
                console.log("Skipping to next. Counter: ", counter);
                return delay(1000).then(() => downloadPDFsSlow(linksArray, counter + increment));
            }
            throw new Error('downloadPDFsSlow FAILED at the end' + JSON.stringify(e));
        });
}

/**
 * Uploads fileURL to the bucket; deletes the object if it is a known
 * "PDF not available" placeholder size, otherwise renames it to
 * pdf/<reName>.pdf.
 *
 * NOTE(review): Storage#upload expects a LOCAL file path, not a remote URL.
 * Passing an http(s) URL here will fail — the file must be fetched first
 * (or streamed into bucket.file(...).createWriteStream()). Confirm against
 * the @google-cloud/storage docs before running at scale.
 *
 * Fixed: removed the `new Promise(...)` wrapper around an existing promise
 * chain (explicit-construction anti-pattern); the chain is returned directly.
 *
 * @param {string} bucketName - destination bucket.
 * @param {string} fileURL - source passed to Storage#upload.
 * @param {string} reName - contentID used to build the final object name.
 * @returns {Promise} resolves with the delete result (placeholder case) or
 *   the original upload response tuple (renamed case); rejects on any error.
 */
function uploadFile(bucketName, fileURL, reName) {
    return storage
        .bucket(bucketName)
        .upload(fileURL, {
            // Support for HTTP requests made with `Accept-Encoding: gzip`
            gzip: true,
            metadata: {
                // Long-lived caching — safe only because uploaded PDFs never change.
                cacheControl: 'public, max-age=31536000',
            },
        })
        .then((x) => {
            console.log("SIZE: ", x[1].size);
            // Magic sizes: <=202 bytes, or ~13235 bytes, are the server's
            // "PDF not available" placeholders — discard those objects.
            if (x[1].size <= 202 || (x[1].size <= 13236 && x[1].size >= 13234)) {
                return deleteFile(bucketName, x[1].name);
            }
            return renameFile(bucketName, x[1].name, "pdf/" + reName + ".pdf")
                .then(() => x); // preserve original resolution value (upload response)
        })
        .catch(err => {
            console.error('ERROR:', err);
            throw err;
        });
    // [END storage_upload_file]
}
  
/**
 * Logs the names of all files in the bucket.
 *
 * Fixed: the promise chain was floating (never returned), so callers could
 * neither await completion nor observe failures; it is now returned.
 *
 * @param {string} bucketName - bucket to list.
 * @returns {Promise<void>} settles when listing (and logging) is done;
 *   errors are logged and swallowed, matching the original behavior.
 */
function listFiles(bucketName) {
    // Lists files in the bucket
    return storage
        .bucket(bucketName)
        .getFiles()
        .then(results => {
            const files = results[0];
            console.log('Files:');
            files.forEach(file => {
                console.log(file.name);
            });
        })
        .catch(err => {
            console.error('ERROR:', err);
        });
    // [END storage_list_files]
}

/**
 * Moves (renames) an object within the bucket.
 *
 * Fixed: removed the `new Promise(...)` wrapper around the existing
 * Storage promise chain (explicit-construction anti-pattern).
 *
 * @param {string} bucketName - bucket containing the object.
 * @param {string} srcFilename - current object name.
 * @param {string} destFilename - new object name.
 * @returns {Promise} resolves with the move response; rejects on error.
 */
function renameFile(bucketName, srcFilename, destFilename) {
    // Moves the file within the bucket
    return storage
        .bucket(bucketName)
        .file(srcFilename)
        .move(destFilename)
        .then((x) => {
            console.log(
                `gs://${bucketName}/${srcFilename} moved to gs://${bucketName}/${destFilename}.`
            );
            return x;
        })
        .catch(err => {
            console.error('ERROR:', err);
            throw err;
        });
    // [END storage_move_file]
}

/**
 * Deletes an object from the bucket.
 *
 * Fixed: the success log used `$(unknown)` (shell-style, and a lost
 * variable) instead of `${filename}`; also removed the `new Promise(...)`
 * wrapper around the existing Storage promise chain.
 *
 * @param {string} bucketName - bucket containing the object.
 * @param {string} filename - object name to delete.
 * @returns {Promise} resolves with the delete response; rejects on error.
 */
function deleteFile(bucketName, filename) {
    // Deletes the file from the bucket
    return storage
        .bucket(bucketName)
        .file(filename)
        .delete()
        .then((x) => {
            console.log(`gs://${bucketName}/${filename} deleted.`);
            return x;
        })
        .catch(err => {
            console.error('ERROR:', err);
            throw err;
        });
    // [END storage_delete_file]
}

/**
 * Marks each contentID's Record row with its uploaded filename
 * (localFile = "<contentID>.pdf").
 *
 * Fixed: the promise chain was floating — nothing was returned, so callers
 * could never await the SQL writes or see failures. The chain is now
 * returned (backward-compatible: existing callers ignore the return value).
 * Errors are still logged and swallowed, as in the original.
 *
 * @param {string[]} recordsToUpdate - contentIDs to update.
 * @returns {Promise<void>}
 */
function updateRecords(recordsToUpdate){
    return db.sequelize.sync({force: false}).then(function(){
        return Promise.all(
            recordsToUpdate.map(x =>
                db.Record.update({localFile: x + '.pdf'}, {where: { contentID: x }})
            )
        )
        .then(() => {
            console.log("Updated filename");
            //db.sequelize.close();
        })
        .catch(e => console.error(e));
    });
}

// EXECUTE — script entry point: run the full pipeline and report the outcome.
getPDFLinks().then(
    result => {
        console.log("getPDFLinks COMPLETE");
        console.log(result);
    },
    err => {
        console.error("getPDFLinks FAILED");
        console.error(err);
    }
);

1 个答案:

答案 0(得分:0)

我建议您研究任务队列(Cloud Tasks)。

一个好的方法是为您的每个dbRecords(也许是批次)创建Tasks。然后,工作进程会拉出每个文件并应用您的转换,然后再将结果保存到GCS。这种方法为您提供了并行性,较短的请求,异步和重试。

https://www.npmjs.com/package/@google-cloud/tasks