How do I write more than 25 items/rows into a DynamoDB table?

Time: 2015-06-26 05:55:06

Tags: database amazon-web-services amazon-dynamodb amazon-redshift

I'm new to Amazon DynamoDB. I currently have 20,000 rows that I need to add to a table. However, based on what I've read, it seems I can only write at most 25 rows at a time using the BatchWriteItem class with up to 25 WriteRequests. Is it possible to increase this? How can I write more than 25 rows at a time? It currently takes about 15 minutes to write all 20,000 rows. Thank you.

5 answers:

Answer 0 (score: 5)

I was looking for some code to do this with the JavaScript SDK. I couldn't find any, so I put it together myself. I hope this helps someone else!

function multiWrite(table, data, cb) {
    var AWS = require('aws-sdk');
    var db = new AWS.DynamoDB.DocumentClient({region: 'us-east-1'});

    // Build the batches
    var batches = [];
    var current_batch = [];
    var item_count = 0;
    for(var x in data) {
        // Add the item to the current batch
        item_count++;
        current_batch.push({
            PutRequest: {
                Item: data[x]
            }
        });
        // If we've added 25 items, add the current batch to the batches array
        // and reset it
        if(item_count%25 == 0) {
            batches.push(current_batch);
            current_batch = [];
        }
    }
    // Add the last batch if it has records and is not equal to 25
    if(current_batch.length > 0 && current_batch.length != 25) batches.push(current_batch);

    // Handler for the database operations
    var completed_requests = 0;
    var errors = false;
    function handler(request) {
        return function(err, data) {
            // Increment the completed requests
            completed_requests++;

            // Set the errors flag
            errors = (errors) ? true : err;

            // Log the error if we got one
            if(err) {
                console.error(JSON.stringify(err, null, 2));
                console.error("Request that caused database error:");
                console.error(JSON.stringify(request, null, 2));
            }

            // Make the callback if we've completed all the requests
            if(completed_requests == batches.length) {
                cb(errors);
            }
        }
    }

    // Make the requests
    var params;
    for(x in batches) {
        // Items go in params.RequestItems.id array
        // Format for the items is {PutRequest: {Item: ITEM_OBJECT}}
        params = '{"RequestItems": {"' + table + '": []}}';
        params = JSON.parse(params);
        params.RequestItems[table] = batches[x];

        // Perform the batchWrite operation
        db.batchWrite(params, handler(params));
    }
}
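
For reference, here is a minimal usage sketch of the function above; the table name and item shape are hypothetical:

// Hypothetical usage: write 20,000 items to a table named "MyTable"
var items = [];
for (var i = 0; i < 20000; i++) {
    items.push({ id: String(i), payload: 'row ' + i });
}

multiWrite('MyTable', items, function(errors) {
    if (errors) {
        console.error('One or more batches failed');
    } else {
        console.log('All batches written');
    }
});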

Answer 1 (score: 4)

You can only send up to 25 items in a single BatchWriteItem request, but you can send as many BatchWriteItem requests as you like at the same time. Assuming you've provisioned enough write throughput, you should be able to speed things up significantly by splitting those 20k rows between multiple threads/processes/hosts and pushing them to the database in parallel.
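
To make that concrete, here is a minimal sketch (assuming Node.js with aws-sdk v2; the table name, region, and item shape are placeholders) that splits the rows into 25-item chunks and fires all of the BatchWriteItem requests concurrently:

// Sketch: write all rows as parallel 25-item batches using the DocumentClient
var AWS = require('aws-sdk');
var docClient = new AWS.DynamoDB.DocumentClient({ region: 'us-east-1' });

function writeAllInParallel(table, items) {
    var requests = [];
    for (var i = 0; i < items.length; i += 25) {
        // Each BatchWriteItem request may carry at most 25 put requests
        var batch = items.slice(i, i + 25).map(function(item) {
            return { PutRequest: { Item: item } };
        });
        var params = { RequestItems: {} };
        params.RequestItems[table] = batch;
        requests.push(docClient.batchWrite(params).promise());
    }
    return Promise.all(requests);
}

Note that each BatchWriteItem response can come back with UnprocessedItems when the table is throttled, so production code would also need to retry those.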

It's probably a bit heavyweight for a data set that small, but you can use AWS Data Pipeline to ingest data from S3. It basically automates the process of creating a Hadoop cluster to slurp the data out of S3 and send it to DynamoDB in a bunch of parallel BatchWriteItem requests.

Answer 2 (score: 0)

// Assumes lodash (_), the async library, a configured DocumentClient (docClient)
// and a TABLES config object are available in the surrounding module.
function putInHistory(data, cb) {
  var arrayOfArray25 = _.chunk(data, 25);
  async.every(arrayOfArray25, function(arrayOf25, callback) {
    var params = {
      RequestItems: {
        [TABLES.historyTable]: []
      }
    };
    arrayOf25.forEach(function(item) {
      params.RequestItems[TABLES.historyTable].push({
        PutRequest: {
          Item: item
        }
      });
    });
    docClient.batchWrite(params, function(err, data) {
      if (err) {
        console.log(err);
        callback(err);
      } else {
        console.log(data);
        callback(null, true);
      }
    });
  }, function(err, result) {
    if (err) {
      cb(err);
    } else {
      if (result) {
        cb(null, { allWritten: true });
      } else {
        cb(null, { allWritten: false });
      }
    }
  });
}

You can use lodash to split the array into chunks and then use the async library's every/each methods to perform a batchWrite on each chunk of 25 elements.
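
One design note on the sketch above: async.every runs its iteratee over all chunks in parallel and only reports whether every batch succeeded, so the allWritten flag tells you that something failed but not which batch. If you need the batches written one at a time (for example, to stay under a low provisioned write capacity), a series variant such as async.eachSeries could be used instead.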

Answer 3 (score: 0)

Here is @Geerek's answer as a solution with a Lambda function:

exports.handler = (event, context, callback) => {
  console.log(`EVENT: ${JSON.stringify(event)}`);

  var AWS = require('aws-sdk');

  AWS.config.update({ region: process.env.REGION })

  var docClient = new AWS.DynamoDB.DocumentClient();

  const { data, table } = event;

  // Build the batches
  var batches = [];
  var current_batch = [];
  var item_count = 0;

  for (var i = 0; i < data.length; i++) {
    // Add the item to the current batch
    item_count++
    current_batch.push({
      PutRequest: {
        Item: data[i],
      },
    })
    // If we've added 25 items, add the current batch to the batches array
    // and reset it
    if (item_count % 25 === 0) {
      batches.push(current_batch)
      current_batch = []
    }
  }

  // Add the last batch if it has records and is not equal to 25
  if (current_batch.length > 0 && current_batch.length !== 25) {
    batches.push(current_batch)
  }

  // Handler for the database operations
  var completed_requests = 0
  var errors = false

  function handler (request) {

    console.log('in the handler: ', request)

    return function (err, data) {
      // Increment the completed requests
      completed_requests++;

      // Set the errors flag
      errors = (errors) ? true : err;

      // Log the error if we got one
      if (err) {
        console.error(JSON.stringify(err, null, 2));
        console.error("Request that caused database error:");
        console.error(JSON.stringify(request, null, 2));
      }

      // Invoke the Lambda callback once, after all the batch requests have completed
      if (completed_requests === batches.length) {
        callback(errors || null);
      }
    }
  }

  // Make the requests
  var params;
  for (var j = 0; j < batches.length; j++) {
    // Items go in params.RequestItems.id array
    // Format for the items is {PutRequest: {Item: ITEM_OBJECT}}
    params = '{"RequestItems": {"' + table + '": []}}'
    params = JSON.parse(params)
    params.RequestItems[table] = batches[j]

    console.log('before db.batchWrite: ', params)

    // Perform the batchWrite operation
    docClient.batchWrite(params, handler(params))
  }
};
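
A hypothetical test event for this handler would carry the table name and the rows to write, for example:

{
  "table": "MyTable",
  "data": [
    { "id": "1", "payload": "first row" },
    { "id": "2", "payload": "second row" }
  ]
}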

Answer 4 (score: 0)

I wrote an npm package that should work as a simple drop-in replacement for the batchWrite method; you just need to pass the dynamoDB instance as the first argument and it should work: https://www.npmjs.com/package/batch-write-all

Check the example in the project README:

// Use below instead of this: dynamodb.batchWrite(params).promise();
batchWriteAll(dynamodb, params).promise();
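
As a rough end-to-end sketch of how that might look (the require name, table name, and item shape here are assumptions based on the package name and the snippet above, not taken from the package documentation):

// Sketch only: assumes the package exports a batchWriteAll(client, params) function
// that accepts the same params shape as DocumentClient.batchWrite and splits the
// items into 25-item requests internally.
const AWS = require('aws-sdk');
const batchWriteAll = require('batch-write-all');

const dynamodb = new AWS.DynamoDB.DocumentClient({ region: 'us-east-1' });

const rows = [];                       // hypothetical data set
for (let i = 0; i < 20000; i++) {
  rows.push({ id: String(i), payload: 'row ' + i });
}

const params = {
  RequestItems: {
    MyTable: rows.map((row) => ({ PutRequest: { Item: row } })), // more than 25 items
  },
};

batchWriteAll(dynamodb, params).promise()
  .then(() => console.log('All items written'))
  .catch((err) => console.error(err));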