Copying a DynamoDB table with Data Pipeline produces an incomplete duplicate

Time: 2015-12-02 16:13:29

Tags: amazon-web-services amazon-dynamodb amazon-data-pipeline

I have a 14.05 GB DynamoDB table with 140,000,000 items. I am trying to clone it (to the same region) using Data Pipeline, but when the pipeline finishes the destination table only has about 160,000 items. I waited 6 hours before checking the item count.

I set the provisioned throughput to 256 on each table, and the pipeline takes about 20 minutes to complete. Is there anything that could cause the pipeline to copy only part of the table? Is there some hidden limit on table size or item count? I have tried this 3 times with the same result: every run finishes with the destination table containing only 90-150k of the 140M items.

I also made sure the maximum execution time was set very high.

Is Data Pipeline the easiest way to quickly copy a DynamoDB table?

Thanks.

2 answers:

Answer 0: (score: 1)

Amazon replied to my support ticket and confirmed that this is a known issue (bug) in Data Pipeline.

They suggested I use this Java program, https://github.com/awslabs/dynamodb-import-export-tool, to first export the table to S3 and then import it back into DynamoDB.
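
For reference, a rough sketch of how that tool is usually run after building it with Maven. The jar name, version, and flag names below are written from memory of the project's README (treat them as assumptions and verify against the repository); the endpoints and table names are placeholders:

# Build the tool; the runnable jar lands under target/.
mvn clean install

# Copy from the source table to the destination table.
# Flag names are assumed from the README; check the repo for the exact invocation.
java -jar target/dynamodb-import-export-tool-1.0.0.jar \
    --sourceEndpoint dynamodb.us-east-1.amazonaws.com \
    --sourceTable MySourceTable \
    --destinationEndpoint dynamodb.us-east-1.amazonaws.com \
    --destinationTable MyDestinationTable \
    --readThroughputRatio 0.5 \
    --writeThroughputRatio 0.5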

Answer 1: (score: -1)

Using AWS Data Pipeline's EmrActivity, you can copy from one DynamoDB table to another. Below is a sample pipeline definition.

{
"objects": [
    {
        "startAt": "FIRST_ACTIVATION_DATE_TIME",
        "name": "DailySchedule",
        "id": "DailySchedule",
        "period": "1 day",
        "type": "Schedule",
        "occurrences": "1"
    },
    {
        "id": "Default",
        "name": "Default",
        "scheduleType": "CRON",
        "pipelineLogUri": "#{myS3LogsPath}",
        "schedule": {
            "ref": "DailySchedule"
        },
        "failureAndRerunMode": "CASCADE",
        "role": "DataPipelineDefaultRole",
        "resourceRole": "DataPipelineDefaultResourceRole"
    },
    {
        "id": "DDBSourceTable",
        "tableName": "#{myDDBSourceTableName}",
        "name": "DDBSourceTable",
        "type": "DynamoDBDataNode",
        "readThroughputPercent": "#{myDDBReadThroughputRatio}"
    },
    {
        "name": "S3TempLocation",
        "id": "S3TempLocation",
        "type": "S3DataNode",
        "directoryPath": "#{myTempS3Folder}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}"
    },
    {
        "id": "DDBDestinationTable",
        "tableName": "#{myDDBDestinationTableName}",
        "name": "DDBDestinationTable",
        "type": "DynamoDBDataNode",
        "writeThroughputPercent": "#{myDDBWriteThroughputRatio}"
    },
    {
        "id": "EmrClusterForBackup",
        "name": "EmrClusterForBackup",
        "releaseLabel": "emr-4.2.0",
        "masterInstanceType": "m3.xlarge",
        "coreInstanceType": "m3.xlarge",
        "coreInstanceCount": "1",
        "region": "#{myDDBSourceRegion}",
        "terminateAfter": "6 Hours",
        "type": "EmrCluster"
    },
    {
        "id": "EmrClusterForLoad",
        "name": "EmrClusterForLoad",
        "releaseLabel": "emr-4.2.0",
        "masterInstanceType": "m3.xlarge",
        "coreInstanceType": "m3.xlarge",
        "coreInstanceCount": "1",
        "region": "#{myDDBDestinationRegion}",
        "terminateAfter": "6 Hours",
        "type": "EmrCluster"
    },
    {
        "id": "TableLoadActivity",
        "name": "TableLoadActivity",
        "runsOn": {
            "ref": "EmrClusterForLoad"
        },
        "input": {
            "ref": "S3TempLocation"
        },
        "output": {
            "ref": "DDBDestinationTable"
        },
        "type": "EmrActivity",
        "maximumRetries": "2",
        "dependsOn": {
           "ref": "TableBackupActivity"
        },
        "resizeClusterBeforeRunning": "true",
        "step": [
            "s3://dynamodb-emr-#{myDDBDestinationRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbImport,#{input.directoryPath},#{output.tableName},#{output.writeThroughputPercent}"
        ]
    },
    {
        "id": "TableBackupActivity",
        "name": "TableBackupActivity",
        "input": {
            "ref": "DDBSourceTable"
        },
        "output": {
            "ref": "S3TempLocation"
        },
        "runsOn": {
            "ref": "EmrClusterForBackup"
        },
        "resizeClusterBeforeRunning": "true",
        "type": "EmrActivity",
        "maximumRetries": "2",
        "step": [
            "s3://dynamodb-emr-#{myDDBSourceRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}"
        ]
    },
    {
        "dependsOn": {
            "ref": "TableLoadActivity"
        },
        "name": "S3CleanupActivity",
        "id": "S3CleanupActivity",
        "input": {
            "ref": "S3TempLocation"
        },
        "runsOn": {
           "ref": "EmrClusterForBackup"
        },
        "type": "ShellCommandActivity",
        "command": "(sudo yum -y update aws-cli) && (aws s3 rm #{input.directoryPath} --recursive)"
    }
],
"parameters": [
    {
        "myComment": "This Parameter specifies the S3 logging path for the pipeline.  It is used by the 'Default' object to set the 'pipelineLogUri' value.",
        "id" : "myS3LogsPath",
        "type" : "AWS::S3::ObjectKey",
        "description" : "S3 path for pipeline logs."
    },
    {
        "id": "myDDBSourceTableName",
        "type": "String",
        "description": "Source DynamoDB table name"
    },
    {
        "id": "myDDBDestinationTableName",
        "type": "String",
        "description": "Target DynamoDB table name"
    },
    {
        "id": "myDDBWriteThroughputRatio",
        "type": "Double",
        "description": "DynamoDB write throughput ratio",
        "default": "0.25",
        "watermark": "Enter value between 0.1-1.0"
    },
    {
        "id": "myDDBSourceRegion",
        "type": "String",
        "description": "Region of the DynamoDB table",
        "default": "us-east-1",
        "watermark": "us-east-1"
    },
    {
        "id": "myDDBDestinationRegion",
        "type": "String",
        "description": "Region of the DynamoDB table",
        "default": "us-east-1",
        "watermark": "us-east-1"
    },
    {
        "id": "myDDBReadThroughputRatio",
        "type": "Double",
        "description": "DynamoDB read throughput ratio",
        "default": "0.25",
        "watermark": "Enter value between 0.1-1.0"
    },
    {
        "myComment": "Temporary S3 path to store the dynamodb backup csv files, backup files will be deleted after the copy completes",
        "id": "myTempS3Folder",
        "type": "AWS::S3::ObjectKey",
        "description": "Temporary S3 folder"
    }
]
}
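
If it helps, one way to register and activate a definition like the one above is with the AWS CLI. The pipeline name, pipeline id (df-EXAMPLE), bucket paths, and table names below are placeholders, and the --parameter-values syntax is best double-checked against the Data Pipeline CLI documentation:

# Save the definition above as pipeline.json, then create an empty pipeline.
aws datapipeline create-pipeline --name ddb-table-copy --unique-id ddb-table-copy

# Upload the definition, supplying values for the parameters it declares
# (use the df-... pipeline id returned by the previous command).
aws datapipeline put-pipeline-definition \
    --pipeline-id df-EXAMPLE \
    --pipeline-definition file://pipeline.json \
    --parameter-values \
        myS3LogsPath=s3://my-bucket/logs \
        myTempS3Folder=s3://my-bucket/ddb-copy-temp \
        myDDBSourceTableName=MySourceTable \
        myDDBDestinationTableName=MyDestinationTable \
        myDDBSourceRegion=us-east-1 \
        myDDBDestinationRegion=us-east-1 \
        myDDBReadThroughputRatio=0.5 \
        myDDBWriteThroughputRatio=0.5

# Start the pipeline.
aws datapipeline activate-pipeline --pipeline-id df-EXAMPLE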