I am trying to ingest data into Druid from a Hive ORC-compressed table stored in HDFS. Any pointers on this would be very helpful.
Answer 0 (score: 0)
Assuming you already have Druid and Yarn/MapReduce set up, you can launch an index_hadoop task, which will do what you are asking for.
There is a druid-orc-extensions extension that allows reading ORC files. I don't think it ships with the standard distribution, so you will have to obtain it somehow (we built it from source).
(list of extensions: http://druid.io/docs/latest/development/extensions.html)
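Once you have the extension jars, Druid also has to be told to load them. A minimal sketch of the relevant common.runtime.properties entries; the paths and the directory name druid-orc-extensions are assumptions, adjust them to your install and build:

    # common.runtime.properties (paths assumed; adjust to your install)
    druid.extensions.directory=/opt/druid/extensions
    druid.extensions.loadList=["druid-orc-extensions"]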
Here is an example that will take a bunch of ORC files and append the interval to a datasource. POST it to an Overlord at http://overlord:8090/druid/indexer/v1/task
(doc http://druid.io/docs/latest/ingestion/batch-ingestion.html)
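For instance, if you save the task spec below as task.json, submitting it is a single request (the overlord hostname is an assumption, use your own Overlord address):

    curl -X POST -H 'Content-Type: application/json' \
         -d @task.json \
         http://overlord:8090/druid/indexer/v1/task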
You may have to adjust it for your distribution; I remember we ran into some class-not-found issues on Hortonworks (classpathPrefix will help adjust the MapReduce classpath; see the sketch after the task spec below).
{
  "type": "index_hadoop",
  "spec": {
    "ioConfig": {
      "type": "hadoop",
      "inputSpec": {
        "type": "granularity",
        "inputFormat": "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat",
        "dataGranularity": "hour",
        "inputPath": "/apps/hive/warehouse/table1",
        "filePattern": ".*",
        "pathFormat": "'partition='yyyy-MM-dd'T'HH"
      }
    },
    "dataSchema": {
      "dataSource": "cube_indexed_from_orc",
      "parser": {
        "type": "orc",
        "parseSpec": {
          "format": "timeAndDims",
          "timestampSpec": {
            "column": "timestamp",
            "format": "nano"
          },
          "dimensionsSpec": {
            "dimensions": ["cola", "colb", "colc"],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        },
        "typeString": "struct<timestamp:bigint,cola:bigint,colb:string,colc:string,cold:bigint>"
      },
      "metricsSpec": [{
        "type": "count",
        "name": "count"
      }],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "HOUR",
        "intervals": ["2017-06-14T00:00:00.000Z/2017-06-15T00:00:00.000Z"]
      }
    },
    "tuningConfig": {
      "type": "hadoop",
      "partitionsSpec": {
        "type": "hashed",
        "targetPartitionSize": 5000000
      },
      "leaveIntermediate": false,
      "forceExtendableShardSpecs": "true"
    }
  }
}
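If you do hit class-not-found errors, classpathPrefix is a top-level field of the index_hadoop task, a sibling of "spec" (see the batch-ingestion doc linked above). A sketch only: the Hortonworks-style path is an assumption, point it at wherever the missing classes live on your cluster, and "spec" is the same object as in the task above (elided here):

    {
      "type": "index_hadoop",
      "classpathPrefix": "/usr/hdp/current/hadoop-client/*",
      "spec": { }
    }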