Node.js Azure SDK getBlobToStream uses huge amounts of memory

Asked: 2013-10-07 15:57:47

Tags: javascript linux node.js azure azure-storage-blobs

I'm writing a backup script that simply downloads all the blobs in every blob container of a particular Azure account.

The script uses async.js to make sure only so many threads can run at the same time, so it doesn't overload the server. The script works fine until it hits a large file, at which point it runs out of memory. My guess is that the download outruns the disk writes and eventually fills up in-memory buffers until I run out of memory entirely, but pinpointing the exact cause has been impossible so far.
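To illustrate the suspicion, here is a minimal sketch (my own illustration, not SDK code) of how a producer that ignores stream backpressure fills memory, while honoring write()'s return value and the 'drain' event keeps it flat:

var fs = require('fs');

var out = fs.createWriteStream('/tmp/out.bin');
var remaining = 10000; //10000 * 64 KB ~= 640 MB, about the size that triggers the problem.

function produce() {
  if (remaining-- === 0) { out.end(); return; }
  var chunk = new Buffer(64 * 1024); //Pretend this chunk just arrived from the network.
  //write() returns false once the stream's internal buffer is full. Ignoring
  //that return value (and 'drain') makes Node queue every pending chunk in memory.
  if (out.write(chunk)) {
    setImmediate(produce);      //The disk kept up, keep producing.
  } else {
    out.once('drain', produce); //The disk fell behind, wait for it.
  }
}
produce();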

The specific function that appears to be using all the memory is this:

blobService.getBlobToStream(
  containerName,
  blob.name,
  fs.createWriteStream(fullPath),
  function(error) {
    if (error) { //Something went wrong: log it, but finish the queue item and continue.
      console.log("Failed writing " + blob.name + " (" + error + ")");
      callback();
    } else { //Write the last-modified date and finish the queue item silently.
      fs.writeFile(fullPath + ".date", blobLastModified.toString(), function(err) {
        if (err) console.log("Couldn't write .date file: " + err);
      });
      callback();
    }
  });

Even a download of just 700 MB easily pushes my memory usage past 1 GB.

Is there any way around this? Am I missing a parameter that would magically stop the Azure SDK from buffering everything and the kitchen sink?

Full code:

#!/usr/bin/env node

//Requires
var azure = require('azure');
var fs    = require('fs');
var mkdirp = require('mkdirp');
var path  = require('path');
var async = require('async');

var maxconcurrency = 1; //Max number of simultaneous getBlobsAndSaveThem() workers run through async.js.

var blobService = azure.createBlobService();

var backupPrefix = '/backups/azurebackup/'; //Always end with a '/'!!

//Main flow of the script is near the bottom of the file.
var containerProcessingQueue = async.queue(
  function getBlobsAndSaveThem(containerName) {
    console.log(containerName); //DEBUG
    blobService.listBlobs(containerName, function(error, blobs) {
      if (!error) {
        var blobProcessingQueue = async.queue(function(index, callback) {
          var blob = blobs[index];
          console.log(blob); //DEBUG
          var fullPath = backupPrefix + containerName + '/' + blob.name;
          var blobLastModified = new Date(blob.properties['last-modified']);

          //Only create the directory if it doesn't already exist, and do it
          //synchronously, because otherwise it'll check 99999 times simultaneously
          //whether the directory exists, not find it, then fail to create it 99998
          //times. (mkdirp.sync() takes no callback, so errors are caught instead.)
          if (!fs.existsSync(path.dirname(fullPath))) {
            try {
              mkdirp.sync(path.dirname(fullPath));
            } catch (err) {
              console.log('Failed to create directory ' + path.dirname(fullPath) + ' (' + err + ')');
            }
          }

          if (fs.existsSync(fullPath + ".date")) {
            if (blobLastModified.toString() == fs.readFileSync(fullPath + ".date").toString()) {
              callback();
              return; //If the file is unmodified, return. No, this won't exit the program, because it's called within a function definition (async.queue(function ...)).
            }
          }

          blobService.getBlobToStream(
            containerName,
            blob.name,
            fs.createWriteStream(fullPath),
            function(error) {
              if (error) { //Something went wrong: log it, but finish the queue item and continue.
                console.log("Failed writing " + blob.name + " (" + error + ")");
                callback();
              } else { //Write the last-modified date and finish the queue item silently.
                fs.writeFile(fullPath + ".date", blobLastModified.toString(), function(err) {
                  if (err) console.log("Couldn't write .date file: " + err);
                });
                callback();
              }
            });
        }, maxconcurrency);

        //Push new items to the queue for processing.
        for (var blobindex in blobs) {
          blobProcessingQueue.push(blobindex);
        }
      } else {
        console.log("An error occurred listing the blobs: " + error);
      }
    });
  }, 1);

blobService.listContainers(function(err, result){
        for(var i=0;i<result.length;i++) {
                containerProcessingQueue.push(result[i].name);
        }
});

2 Answers:

Answer 0 (score: 2)

For anyone who's curious now: the start and end variables have since been renamed. They are now just rangeStart and rangeEnd. Here are the Azure Node docs for more help: http://dl.windowsazure.com/nodestoragedocs/BlobService.html
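For example, the chunked call from the answer below would now look roughly like this (a sketch using the renamed options; check the linked docs for the exact signature):

blobService.getBlobToStream(containerName, blobName, stream,
  { "rangeStart": startPos, "rangeEnd": endPos - 1 },
  function(error) {
    //Same handling as the rangeStartHeader/rangeEndHeader version below.
  });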

Answer 1 (score: 1)

One thing you could do is read only a chunk of data into the stream at a time instead of the whole blob's data, append it to the file, and then read the next chunk. The Blob Storage service supports this. If you look at the source code for getBlobToStream (https://github.com/WindowsAzure/azure-sdk-for-node/blob/master/lib/services/blob/blobservice.js), you can specify from/to bytes in the options - rangeStartHeader and rangeEndHeader. See if that helps.

I've hacked together some code (as you can tell from it, my knowledge of node.js is quite primitive :)). [Please use this code only to get an idea of how to do chunked downloads, as I think it still has some issues.]

var azure = require('azure');
var fs = require('fs');

var blobService = azure.createBlobService("account", "accountkey");
var containerName = "container name";
var blobName = "blob name";
var blobSize;
var chunkSize = 1024 * 512;//chunk size -- we'll read 512 KB at a time.
var startPos = 0;
var fullPath = "D:\\node\\";
var blobProperties = blobService.getBlobProperties(containerName, blobName, null, function (error, blob) {
        if (error) {
            throw error;
        }
        else    {
            blobSize = blob.contentLength;
            fullPath = fullPath + blobName;
            console.log(fullPath);
            doDownload();
        }
    }
);

function doDownload() {
    var stream = fs.createWriteStream(fullPath, {flags: 'a'});
    var endPos = startPos + chunkSize;
    if (endPos > blobSize) {
        endPos = blobSize;
    }
    console.log("Downloading " + (endPos - startPos) + " bytes starting from " + startPos + " marker.");
    blobService.getBlobToStream("test", blobName, stream, 
        { "rangeStartHeader": startPos, "rangeEndHeader": endPos-1 }, function(error) {
        if (error) {
            throw error;
        }
        else if (!error) {
            startPos = endPos;
            if (startPos <= blobSize - 1) {
                doDownload();
            }
        }
    });
}
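Note the pattern: each doDownload() call opens the file in append mode, writes exactly one chunk, and only recurses for the next range once that chunk's request completes, so no more than chunkSize bytes of blob data are in flight at any moment. That serialization is what keeps memory usage flat.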