我已经搭建了一个多节点 Druid 集群:1) 1 个节点 (m4.xl),同时运行 Coordinator 和 Overlord;2) 2 个节点 (r3.2xl),每个节点都运行 Historical 和 MiddleManager;3) 1 个节点 (r3.2xl),运行 Broker。
现在我有一个正在运行的 EMR 集群,想用它来执行摄取任务。问题是,每当我通过 curl 命令提交作业时,作业总是在这两个 MiddleManager 上以本地 Hadoop 作业的方式启动,而不是提交到远程 EMR 集群。我的数据位于 S3,S3 也被配置为深度存储 (deep storage)。
我还把 EMR 主节点 (master) 上的所有 jar 包复制到了 hadoop-dependencies/hadoop-client/2.7.3/
Druid 版本:0.9.2 EMR 版本:5.2
请参见下面附上的索引作业规范、公共运行时属性 (common.runtime.properties) 以及 MiddleManager 的运行时属性 (runtime.properties)。
文件:data_index.json
{
"type": "index_hadoop",
"spec": {
"ioConfig": {
"type": "hadoop",
"inputSpec": {
"type": "static",
"paths": "s3n://<kjcnskd>smallTest"
}
},
"dataSchema": {
"dataSource": "multi_value_test_01",
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "day",
"queryGranularity": "none",
"intervals": [
"2011-09-12/2017-09-13"
]
},
"parser": {
"type": "string",
"parseSpec": {
"format": "tsv",
"delimiter": "\u0001",
"listDelimiter": "|",
"columns": [
"article_type",
"brand",
"gender",
"brand_type",
"master_category",
"supply_type",
"business_unit",
"testdim",
"date",
"week",
"month",
"year",
"style_id",
"live_styles",
"non_live_styles",
"broken_style",
"new_season_styles",
"live_styles_qty",
"non_live_styles_qty",
"broken_style_qty",
"new_season_styles_qty"
],
"dimensionsSpec": {
"dimensions": [
"article_type",
"brand",
"gender",
"brand_type",
"master_category",
"supply_type",
"business_unit",
"testdim",
"week",
"month",
"year",
"style_id"
]
},
"timestampSpec": {
"column": "date",
"format": "yyyyMMdd"
}
}
},
"metricsSpec": [
{
"name": "live_styles",
"type": "doubleSum",
"fieldName": "live_styles"
},
{
"name": "non_live_styles",
"type": "doubleSum",
"fieldName": "non_live_styles"
},
{
"name": "broken_style",
"type": "doubleSum",
"fieldName": "broken_style"
},
{
"name": "new_season_styles",
"type": "doubleSum",
"fieldName": "new_season_styles"
},
{
"name": "live_styles_qty",
"type": "doubleSum",
"fieldName": "live_styles_qty"
},
{
"name": "broken_style_qty",
"type": "doubleSum",
"fieldName": "broken_style_qty"
},
{
"name": "new_season_styles_qty",
"type": "doubleSum",
"fieldName": "new_season_styles_qty"
}
]
},
"tuningConfig": {
"type": "hadoop",
"partitionsSpec": {
"type": "hashed",
"targetPartitionSize": 5000000
},
"jobProperties": {
"fs.s3.awsAccessKeyId": "XXXXXXXXXXXXXX",
"fs.s3.awsSecretAccessKey": "XXXXXXXXXXXXXX",
"fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
"fs.s3n.awsAccessKeyId": "XXXXXXXXXXXXXX",
"fs.s3n.awsSecretAccessKey": "XXXXXXXXXXXXXX",
"fs.s3n.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
"io.compression.codecs": "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec"
}
}
}
}
文件:common.runtime.properties
#
# Licensed to Metamarkets Group Inc. (Metamarkets) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Metamarkets licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#
# Extensions
#
# This is not the full list of Druid extensions, but common ones that people often use. You may need to change this list
# based on your particular setup.
druid.extensions.loadList=["druid-kafka-eight", "druid-s3-extensions", "druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "mysql-metadata-storage"]
# If you have a different version of Hadoop, place your Hadoop client jar files in your hadoop-dependencies directory
# and uncomment the line below to point to your directory.
druid.extensions.hadoopDependenciesDir=hadoop-dependencies/hadoop-client/2.7.3
#
# Logging
#
# Log all runtime properties on startup. Disable to avoid logging properties on startup:
druid.startup.logging.logProperties=true
#
# Zookeeper
#
druid.zk.service.host=10.0.1.152
druid.zk.paths.base=/druid
#
# Metadata storage
#
# For Derby server on your Druid Coordinator (only viable in a cluster with a single Coordinator, no fail-over):
#druid.metadata.storage.type=derby
#druid.metadata.storage.connector.connectURI=jdbc:derby://metadata.store.ip:1527/var/druid/metadata.db;create=true
#druid.metadata.storage.connector.host=metadata.store.ip
#druid.metadata.storage.connector.port=1527
# For MySQL:
druid.metadata.storage.type=mysql
druid.metadata.storage.connector.connectURI=jdbc:mysql://10.0.1.140:3306/druid
druid.metadata.storage.connector.user=druid
druid.metadata.storage.connector.password=druid123
# For PostgreSQL (make sure to additionally include the Postgres extension):
#druid.metadata.storage.type=postgresql
#druid.metadata.storage.connector.connectURI=jdbc:postgresql://db.example.com:5432/druid
#druid.metadata.storage.connector.user=...
#druid.metadata.storage.connector.password=...
#
# Deep storage
#
# For local disk (only viable in a cluster if this is a network mount):
#druid.storage.type=local
#druid.storage.storageDirectory=var/druid/segments
# For HDFS (make sure to include the HDFS extension and that your Hadoop config files in the cp):
#druid.storage.type=hdfs
#druid.storage.storageDirectory=/druid/segments
# For S3:
druid.storage.type=s3
druid.storage.bucket=asfvdcs
druid.storage.baseKey=druid/segments
druid.s3.accessKey=XXXXXXXXXXXX
druid.s3.secretKey=XXXXXXXXXXXX
#
# Indexing service logs
#
# For local disk (only viable in a cluster if this is a network mount):
druid.indexer.logs.type=file
druid.indexer.logs.directory=var/druid/indexing-logs
# For HDFS (make sure to include the HDFS extension and that your Hadoop config files in the cp):
#druid.indexer.logs.type=hdfs
#druid.indexer.logs.directory=/druid/indexing-logs
# For S3:
#druid.indexer.logs.type=s3
#druid.indexer.logs.s3Bucket=testashutosh
#druid.indexer.logs.s3Prefix=druid/indexing-logs
#
# Service discovery
#
druid.selectors.indexing.serviceName=druid/overlord
druid.selectors.coordinator.serviceName=druid/coordinator
#
# Monitoring
#
druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor"]
druid.emitter=logging
druid.emitter.logging.logLevel=info
文件:MiddleManager 的 runtime.properties
druid.service=druid/middleManager
druid.port=8091
# Number of tasks per middleManager
druid.worker.capacity=3
# Task launch parameters
druid.indexer.runner.javaOpts=-server -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
# HTTP server threads
druid.server.http.numThreads=25
# Processing threads and buffers
druid.processing.buffer.sizeBytes=536870912
druid.processing.numThreads=2
# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=hdfs://ip-10-0-1-xxx.ap-southeast-1.compute.internal:8020/tmp/druid-indexing
druid.indexer.task.defaultHadoopCoordinates=["org.apache.hadoop:hadoop-client:2.7.3"]
druid.indexer.runner.type=remote
答案 0 :(得分:0)
你需要让 Druid 知道你的 Hadoop 集群的信息。引用官方手册:
将您的 Hadoop 配置 XML 文件(core-site.xml、hdfs-site.xml、yarn-site.xml、mapred-site.xml)放到 Druid 节点的类路径 (classpath) 上。您可以把它们复制到 conf/druid/_common/ 目录下,例如 conf/druid/_common/core-site.xml、conf/druid/_common/hdfs-site.xml 等等。
如果您已经这样做了,那么问题很可能出在其中某个配置文件上(我就遇到过这种情况)。