目前,日志存储在DynamoDB中。我们希望从该表中过滤掉不必要的行,并将输出存储到另一张表中(例如,排除"value"字段包含"bot"、"python"、"requests"等内容的行)。
到目前为止,我写出了类似下面的定义(基于AWS模板):
{
  "objects": [
    {
      "name": "EmrClusterForBackup",
      "coreInstanceType": "m1.medium",
      "coreInstanceCount": "1",
      "masterInstanceType": "m1.medium",
      "amiVersion": "3.3.2",
      "id": "EmrClusterForBackup",
      "type": "EmrCluster",
      "terminateAfter": "2 Hours"
    },
    {
      "occurrences": "1",
      "period": "1 Day",
      "name": "RunOnce",
      "id": "DefaultSchedule",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "name": "DDBExportFormat",
      "id": "DDBExportFormat",
      "type": "DynamoDBExportDataFormat"
    },
    {
      "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "S3BackupLocation",
      "id": "S3BackupLocation",
      "type": "S3DataNode"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://ti-labs-ml-data/logs/",
      "scheduleType": "cron",
      "name": "Default",
      "id": "Default"
    },
    {
      "output": {
        "ref": "S3BackupLocation"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "filterSql": "",
      "name": "TableBackupActivity",
      "id": "TableBackupActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "type": "HiveCopyActivity",
      "resizeClusterBeforeRunning": "true"
    },
    {
      "readThroughputPercent": "#{myDDBReadThroughputRatio}",
      "dataFormat": {
        "ref": "DDBExportFormat"
      },
      "name": "DDBSourceTable",
      "id": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBTableName}"
    }
  ],
  "parameters": [
    {
      "description": "Output S3 folder",
      "id": "myOutputS3Loc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "default": "0.2",
      "watermark": "Enter value between 0.1-1.0",
      "description": "DynamoDB read throughput ratio",
      "id": "myDDBReadThroughputRatio",
      "type": "Double"
    },
    {
      "description": "DynamoDB table name",
      "id": "myDDBTableName",
      "type": "String"
    }
  ],
  "values": {
    "myDDBTableName": "TI-LABS-DDB-A",
    "myDDBReadThroughputRatio": "0.2",
    "myOutputS3Loc": "s3://ti-labs-ml-data/"
  }
}
但是,我不明白过滤查询(filterSql)应该怎么写(我已经尝试过一个——它报错说行中只有"item"这一列,而我的表有2个字段——id和value)。
答案 0 :(得分:-2)
filterSql对应于SQL查询中的WHERE条件(不含WHERE关键字本身)。所以在你的情况下,filterSql应类似于:
value not in ('bot', 'python', 'requests')
请参阅:http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-hivecopyactivity.html
注意:NOT IN 只做精确匹配;如果需要排除"包含"这些子串的值,应改用 NOT LIKE(例如 value NOT LIKE '%bot%')。