Question

目前，日志存储在 DynamoDB 中。我们希望从该表中过滤掉不必要的行，并将输出存储到另一张表中（例如，排除 "value" 字段包含 "bot"、"python"、"requests" 等内容的行）。

到目前为止，我写出了类似下面这样的定义（基于 AWS 模板）：

{
  "objects": [
    {
      "name": "EmrClusterForBackup",
      "coreInstanceType": "m1.medium",
      "coreInstanceCount": "1",
      "masterInstanceType": "m1.medium",
      "amiVersion": "3.3.2",
      "id": "EmrClusterForBackup",
      "type": "EmrCluster",
      "terminateAfter": "2 Hours"
    },
    {
      "occurrences": "1",
      "period": "1 Day",
      "name": "RunOnce",
      "id": "DefaultSchedule",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "name": "DDBExportFormat",
      "id": "DDBExportFormat",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://ti-labs-ml-data/logs/",
      "scheduleType": "cron",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "filterSql": "",
      "name": "TableBackupActivity",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "HiveCopyActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "default": "0.2",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "description": "DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    }
  ],
  "values": {
    "myDDBTableName": "TI-LABS-DDB-A",
    "myDDBReadThroughputRatio": "0.2",
    "myOutputS3Loc": "s3://ti-labs-ml-data/"
  }
}

但是，我不清楚过滤查询应该怎么写（我已经尝试过一个——它提示行中只有 "item" 这一列，而我的表有两个字段：id 和 value）。

Answer 1

filterSql 对应于 SQL 查询中的 WHERE 条件。因此，在你的情况下，filterSql 应该类似于：

value not in ('bot', 'python', 'requests')

请参阅 http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-hivecopyactivity.html

如何使用Amazon Data Pipeline和Hive过滤掉DynamoDB中的数据？

1 个答案: