I want to create an on-demand cluster, submit Spark/Scala JARs to it, and terminate the cluster once the job finishes. In AWS this is straightforward with command-runner.jar. How can I achieve the same on Azure HDInsight?
I don't want any manual steps or anything done through the Azure UI.
Answer 0 (score: 0)
A few months ago I had the same requirement and wrote the following PowerShell script. It calls PuTTY's command-line tools to upload the JAR and a configuration file, then sends the spark-submit command over SSH.
cls

# Paths to the PuTTY command-line tools (plink runs remote commands, pscp copies files)
$PuttyBasePath = "C:\Program Files\PuTTY\"
$PlinkPath = $PuttyBasePath + "plink.exe"
$PscpPath = $PuttyBasePath + "pscp.exe"

# SSH endpoint and credentials of the HDInsight cluster
$SshHost = "someCluster-ssh.azurehdinsight.net"
$SshUserName = "sshuser"
$SshPassword = "somePassword"

# Remote target directory for this run (the timestamp suffix keeps runs separate)
$BaseDirectoryToUploadFiles = "/home/sshuser/testPath"
$DirectoryToUploadFiles = $BaseDirectoryToUploadFiles + "/20180505235534"

# Local files to upload
$ConfigFilePath = "C:\config.json"
$JarFilePath = "C:\somejar.jar"
function UploadFiles
{
    # Create the remote directory, then copy the JAR and config file into it.
    # echo y auto-accepts the host key prompt on the first connection.
    echo y | &$PlinkPath -v -pw $SshPassword $SshUserName@$SshHost mkdir -p $DirectoryToUploadFiles

    $FilesToUpload = @($JarFilePath, $ConfigFilePath)
    foreach ($FileToUpload in $FilesToUpload)
    {
        &$PscpPath -v -pw $SshPassword -r $FileToUpload $SshUserName@${SshHost}:$DirectoryToUploadFiles
    }
}
function RunSparkJob
{
    cls
    # Build the spark-submit command as a single string and run it over SSH via plink.
    echo y | &$PlinkPath -v -pw $SshPassword $SshUserName@$SshHost ("/usr/hdp/current/spark2-client/bin/spark-submit" +
        " --master yarn" +
        " --deploy-mode cluster" +
        " --conf spark.yarn.maxAppAttempts=8" +
        " --conf spark.yarn.am.attemptFailuresValidityInterval=1h" +
        " --conf spark.yarn.max.executor.failures=240" +
        " --conf spark.yarn.executor.failuresValidityInterval=1h" +
        " --conf spark.task.maxFailures=4" +
        " --conf spark.speculation=true" +
        " --conf spark.speculation.multiplier=40" +
        " --conf spark.speculation.quantile=0.85" +
        " --num-executors 30" +
        " --executor-cores 10" +
        " --files ${DirectoryToUploadFiles}/config.json" +
        " --packages org.apache.spark:spark-streaming_2.11:2.2.0,com.microsoft.azure:azure-eventhubs-spark_2.11:2.2.0" +
        " --class com.test.someClass ${DirectoryToUploadFiles}/somejar.jar")
}
# Upload the artifacts, then submit the job.
UploadFiles
RunSparkJob
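
The script above only handles job submission, so it assumes the cluster already exists. To make the cluster itself on-demand (created before the job, deleted afterwards), you can wrap the two calls above with the AzureRM.HDInsight cmdlets. The sketch below is just an outline of that idea, not part of my original script; the resource group, location, node count, and storage values are placeholders you would replace with your own.

# Rough sketch of an on-demand wrapper (placeholder names throughout):
# create the cluster, run the job defined above, then delete the cluster.
Login-AzureRmAccount

$ResourceGroupName = "someResourceGroup"
$ClusterName = "someCluster"   # $SshHost above is derived from this: <name>-ssh.azurehdinsight.net
$Location = "North Europe"

# Gateway (admin) credential, plus an SSH credential matching $SshUserName/$SshPassword
$HttpCredential = Get-Credential -UserName "admin" -Message "Cluster login"
$SshCredential = New-Object System.Management.Automation.PSCredential($SshUserName, (ConvertTo-SecureString $SshPassword -AsPlainText -Force))

New-AzureRmResourceGroup -Name $ResourceGroupName -Location $Location

# Create a Linux Spark cluster; the storage account and key are placeholders.
New-AzureRmHDInsightCluster `
    -ResourceGroupName $ResourceGroupName `
    -ClusterName $ClusterName `
    -Location $Location `
    -ClusterType Spark `
    -OSType Linux `
    -ClusterSizeInNodes 4 `
    -HttpCredential $HttpCredential `
    -SshCredential $SshCredential `
    -DefaultStorageAccountName "somestorage.blob.core.windows.net" `
    -DefaultStorageAccountKey "<storage account key>" `
    -DefaultStorageContainer $ClusterName

# spark-submit in YARN cluster mode waits for the application to finish by
# default (spark.yarn.submit.waitAppCompletion=true), so plink returns only
# after the job completes and it is then safe to delete the cluster.
UploadFiles
RunSparkJob

Remove-AzureRmHDInsightCluster -ResourceGroupName $ResourceGroupName -ClusterName $ClusterName

With a wrapper like this, a scheduled task (or similar automation) can trigger the whole flow end to end, so nothing has to be done by hand or through the Azure portal.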