I keep getting ExitCodeException exitCode=1 when I try to run a Spark job against a large dataset.
The error:
Exit code: 1
Stack trace: ExitCodeException exitCode=1:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:545)
at org.apache.hadoop.util.Shell.run(Shell.java:456)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:722)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:211)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:302)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Container exited with a non-zero exit code 1
)] in 1 attempts
org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [120 seconds]. This timeout is controlled by spark.rpc.askTimeout
at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:48)
at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:63)
at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:59)
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:33)
at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:76)
at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:101)
at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:77)
at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.removeExecutor(CoarseGrainedSchedulerBackend.scala:359)
at org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint$$anonfun$receive$1.applyOrElse(YarnSchedulerBackend.scala:176)
at org.apache.spark.rpc.netty.Inbox$$anonfun$process$1.apply$mcV$sp(Inbox.scala:116)
at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:204)
at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:100)
at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:215)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [120 seconds]
at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219)
at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107)
at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
at scala.concurrent.Await$.result(package.scala:107)
at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
... 11 more
My configuration:
30 Slaves of r3.8xLarge
spark.driver.cores 30
spark.driver.memory 200g
spark.executor.cores 7
spark.executor.instances 240
spark.executor.memory 40g
spark.memory.fraction 0.7
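Restated as SparkConf entries, those settings look like this (shown only for completeness; note that spark.rpc.askTimeout, which the stack trace above mentions, is left at its 120s default):

import org.apache.spark.SparkConf

// The settings listed above, expressed programmatically (illustrative only).
// spark.rpc.askTimeout from the stack trace is not set here, so it stays at its 120s default.
val conf = new SparkConf()
  .set("spark.driver.cores", "30")
  .set("spark.driver.memory", "200g")
  .set("spark.executor.cores", "7")
  .set("spark.executor.instances", "240")
  .set("spark.executor.memory", "40g")
  .set("spark.memory.fraction", "0.7")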
I have a df = [CUSTOMER_ID, itemType, eventTimeStamp, valueType, value]:
+-------------+----------+-----------+-------+----------------------+
| CUSTOMER_ID | itemType | valueType | value | eventTimeStamp       |
+-------------+----------+-----------+-------+----------------------+
| 1           | rent     | dvd       | 12    | 2016-09-19T00:00:00Z |
| 1           | rent     | dvd       | 12    | 2016-09-19T00:00:00Z |
| 1           | buy      | tv        | 12    | 2016-09-20T00:00:00Z |
| 1           | rent     | movie     | 12    | 2016-09-20T00:00:00Z |
| 1           | buy      | movie     | 12    | 2016-09-18T00:00:00Z |
| 1           | buy      | movie     | 12    | 2016-09-18T00:00:00Z |
+-------------+----------+-----------+-------+----------------------+
The result I want is:
CUSTOMER_ID : 1
totalValue : 72 --- group by based on id
itemTypeMap : {"rent" : 3, "buy" : 3} --- group by based on id
valueTypeMap : {"dvd" : 2, "tv" : 1, "movie" : 3 } --- group by based on id
itemTypeForDay : {"rent" : 2, "buy" : 2} --- group by id and dayofmonth(col("eventTimeStamp")), counting each itemType at most once per day
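For reference, those numbers follow from the six sample rows above; a rough sketch of the same aggregates using only built-in functions (Spark 2.x assumed; this is not the code I actually run, which follows below):

import org.apache.spark.sql.functions.{col, countDistinct, dayofmonth, sum}

// Sketch only: the per-customer aggregates described above, via built-in functions.
val totalValue     = df.groupBy("CUSTOMER_ID").agg(sum("value").alias("totalValue"))   // 6 rows x 12 = 72
val itemTypeCount  = df.groupBy("CUSTOMER_ID", "itemType").count()                     // rent -> 3, buy -> 3
val valueTypeCount = df.groupBy("CUSTOMER_ID", "valueType").count()                    // dvd -> 2, tv -> 1, movie -> 3
val itemTypeDays   = df.groupBy("CUSTOMER_ID", "itemType")
  .agg(countDistinct(dayofmonth(col("eventTimeStamp"))).alias("days"))                 // rent -> 2, buy -> 2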
My code (CollectListFunction and CombineMaps come from my Util class):
import org.apache.spark.sql.functions.{col, countDistinct, dayofmonth, sum, udf}
import org.apache.spark.sql.types.StringType

// UDFs and custom aggregators (CollectListFunction / CombineMaps are defined in my Util class)
val collectAsList = new CollectListFunction(StringType)
val count_by_value = udf { (value: scala.collection.mutable.WrappedArray[String]) =>
  if (value == null) null else value.groupBy(identity).mapValues(_.size)
}
val toMap = udf { (typ: String, count: Int) => Map(typ -> count) }

// Stage 1: totalValues plus per-customer counts of itemType and valueType
val temp = df.groupBy("CUSTOMER_ID").agg(
  collectAsList(df("itemType")).alias("itemCount"),
  collectAsList(df("valueType")).alias("valueTypeCount"),
  sum("value") as "totalValues")

val stage1 = temp
  .withColumn("valueTypeMap", count_by_value(col("valueTypeCount")))
  .withColumn("itemTypeMap", count_by_value(col("itemCount")))
  .drop("itemCount")
  .drop("valueTypeCount")

// Stage 2: distinct days per itemType, combined into one map per customer
val stage2 = df.groupBy("CUSTOMER_ID", "itemType")
  .agg(countDistinct(dayofmonth(col("eventTimeStamp"))) as "daysPeritemType")
  .withColumn("itemTypeForDay", toMap(col("itemType"), col("daysPeritemType")))
  .groupBy("CUSTOMER_ID").agg(CombineMaps(col("itemTypeForDay")) as "resultMap")

// Join the two stages and save
val result = stage1.join(stage2, stage1("CUSTOMER_ID") === stage2("CUSTOMER_ID"))
  .drop(stage2("CUSTOMER_ID"))

TestPairFucntion.saveAsTempFile(convertToRDD(result))
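The Util class itself is not pasted here; roughly, CombineMaps is an aggregator that folds the per-row maps into a single map per customer, along these lines (a sketch of the shape, not the exact Util code):

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType, StructType}

// Sketch only: a map-combining aggregator in the shape of CombineMaps.
object CombineMaps extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = new StructType().add("m", MapType(StringType, IntegerType))
  override def bufferSchema: StructType = new StructType().add("acc", MapType(StringType, IntegerType))
  override def dataType: DataType = MapType(StringType, IntegerType)
  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit =
    buffer(0) = Map.empty[String, Int]

  // Add one incoming map into the running buffer, summing values on key collisions.
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    if (!input.isNullAt(0)) {
      val acc = buffer.getMap[String, Int](0)
      val m   = input.getMap[String, Int](0)
      buffer(0) = acc ++ m.map { case (k, v) => k -> (acc.getOrElse(k, 0) + v) }
    }

  // Merge two partial buffers the same way.
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val a = buffer1.getMap[String, Int](0)
    val b = buffer2.getMap[String, Int](0)
    buffer1(0) = a ++ b.map { case (k, v) => k -> (a.getOrElse(k, 0) + v) }
  }

  override def evaluate(buffer: Row): Any = buffer.getMap[String, Int](0)
}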
Can someone tell me what I am doing wrong here?