我无法通过提供Hadoop配置来启动EMR版本 5.11.0 (AWS JAVA SDK版本 1.11.221 )的集群。
但是,每当省略外部Hadoop配置时(即删除 .withConfigurations 方法调用),集群就会成功启动。我需要在提供配置的情况下启动集群。
启动集群的代码:
// Build and submit the EMR RunJobFlow request.
// NOTE(review): the original code called .withSteps(...) twice. withSteps REPLACES
// the step list rather than appending to it, so the earlier call was silently
// discarded. Moreover, the discarded call passed a plain String ("mySteps"), which
// matches no RunJobFlowRequest.withSteps overload (they take StepConfig... or
// Collection<StepConfig>). Consolidated into a single withSteps call below.
RunJobFlowRequest request = new RunJobFlowRequest()
        // Hadoop/Spark configuration objects built from the JSON shown below.
        .withConfigurations(prepareConfigurations(element.getAsJsonObject()))
        .withName("EMR_PROCESSING__20190201")
        .withReleaseLabel("emr-5.11.0")
        .withApplications(new Application().withName("Hadoop"),
                new Application().withName("Ganglia"),
                new Application().withName("Spark"))
        .withLogUri("s3://myS3P")
        .withServiceRole("myDefaultRole")
        .withJobFlowRole("myDefaultRole")
        .withVisibleToAllUsers(true)
        .withSecurityConfiguration("myConfigs")
        // Single withSteps call: a later withSteps would overwrite this list.
        .withSteps(new StepConfig().withName("Enable debugging")
                .withActionOnFailure(ActionOnFailure.TERMINATE_CLUSTER)
                .withHadoopJarStep(new StepFactory().newEnableDebuggingStep()))
        .withInstances(new JobFlowInstancesConfig()
                .withEc2KeyName("myKeyName")
                .withEc2SubnetId("subnet-**")
                .withInstanceCount(2)
                // Keep the cluster alive after steps finish so more work can be submitted.
                .withKeepJobFlowAliveWhenNoSteps(true)
                .withMasterInstanceType("r3.xlarge")
                .withSlaveInstanceType("c4.8xlarge"));
RunJobFlowResult result = emr.runJobFlow(request);
System.out.println("Cluster launch ::: " + result.getJobFlowId());
我正在使用以下JSON作为Hadoop配置。
[{
"classification": "core-site",
"properties": {
"fs.s3a.access.key": "********",
"fs.s3.awsAccessKeyId": "********",
"fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
"hadoop.proxyuser.mapred.hosts": "*",
"hadoop.proxyuser.mapred.groups": "*",
"io.compression.codec.lzo.class": "com.hadoop.compression.lzo.LzoCodec",
"fs.s3.awsSecretAccessKey": "********",
"io.compression.codecs": "com.hadoop.compression.lzo.LzoCodec",
"fs.s3a.buffer.dir": "${hadoop.tmp.dir}/s3a",
"fs.s3a.secret.key": "********"
},
"configurations": []
}, {
"classification": "mapred-site",
"properties": {
"mapreduce.reduce.shuffle.parallelcopies": "20",
"mapreduce.task.io.sort.mb": "512",
"mapreduce.tasktracker.reduce.tasks.maximum": "10",
"mapreduce.map.speculative": "false",
"mapreduce.output.fileoutputformat.compress": "true",
"mapreduce.output.fileoutputformat.compress.codec": "com.hadoop.compression.lzo.LzoCodec",
"mapred.child.java.opts": "-Xmx3500m",
"mapreduce.job.reduce.slowstart.completedmaps": "0.99",
"mapreduce.tasktracker.map.tasks.maximum": "13",
"mapreduce.task.io.sort.factor": "48",
"mapreduce.reduce.java.opts": "-Xmx4500m",
"mapreduce.map.memory.mb": "4096",
"mapreduce.map.output.compress.codec": "com.hadoop.compression.lzo.LzoCodec",
"mapreduce.job.reduces": "80",
"yarn.app.mapreduce.am.command-opts": "-Xmx2000m",
"mapreduce.reduce.memory.mb": "5120",
"mapreduce.map.java.opts": "-Xmx3800m",
"mapreduce.reduce.speculative": "false",
"yarn.app.mapreduce.am.resource.mb": "2048"
},
"configurations": []
}, {
"classification": "yarn-site",
"properties": {
"yarn.nodemanager.aux-services": "mapreduce_shuffle,spark_shuffle",
"yarn.nodemanager.resource.cpu-vcores": "36",
"yarn.nodemanager.resource.memory-mb": "57344",
"yarn.application.classpath": "$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*,/data/cascading/lib/*,/usr/lib/hadoop-lzo/lib/*,/usr/share/aws/emr/emrfs/conf,/usr/share/aws/emr/emrfs/lib/*,/usr/share/aws/emr/emrfs/auxlib/*,/usr/share/aws/emr/lib/*,/usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,/usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,/usr/share/aws/emr/kinesis/lib/emr-kinesis-hadoop.jar,/usr/share/aws/emr/cloudwatch-sink/lib/*",
"yarn.scheduler.maximum-allocation-vcores": "36",
"yarn.scheduler.maximum-allocation-mb": "57344",
"yarn.scheduler.minimum-allocation-mb": "512",
"yarn.nodemanager.aux-services.spark_shuffle.class": "org.apache.spark.network.yarn.YarnShuffleService"
},
"configurations": []
}, {
"classification": "hdfs-site",
"properties": {
"dfs.blocksize": "134217728"
},
"configurations": []
}, {
"classification": "capacity-scheduler",
"properties": {
"yarn.scheduler.capacity.root.acl_submit_applications": "hadoop,yarn,mapred,hdfs",
"yarn.scheduler.capacity.root.queues": "default",
"yarn.scheduler.capacity.root.default.acl_submit_applications": "hadoop,yarn,mapred,hdfs",
"yarn.scheduler.capacity.root.default.capacity": "100",
"yarn.scheduler.capacity.root.default.state": "RUNNING"
},
"configurations": []
}, {
"classification": "hadoop-env",
"properties": {},
"configurations": [{
"classification": "export",
"properties": {
"HADOOP_CLASSPATH": "\"${HADOOP_CLASSPATH}:/home/hadoop/.driven-plugin/:/data/cascading/lib/*\""
},
"configurations": []
}]
}, {
"classification": "yarn-env",
"properties": {},
"configurations": [{
"classification": "export",
"properties": {
"YARN_USER_CLASSPATH": "\"${YARN_USER_CLASSPATH}:/home/hadoop/.driven-plugin/\""
},
"configurations": []
}]
}, {
"classification": "spark-defaults",
"properties": {
"spark.executor.memory": "8G",
"spark.driver.memory": "10G",
"spark.executor.cores": "5",
"spark.executor.instances": "49"
},
"configurations": []
}]
我使用相同的配置和代码来启动EMR集群(版本5.0.0 和 AWS-JAVA-SDK-1.11.39 )。升级给我带来了问题。
我是否提供了错误的配置/版本以启动5.11.0集群,或者我在这里错过了一些东西?