My goal is to read a MySQL table (more than 50 million rows) into HDFS.
Spark version: 3.0.3
object DBHelper {
  def setConnectionProperty(): Properties = {
    val connProp = new Properties
    connProp.put("driver", "com.mysql.cj.jdbc.Driver")
    connProp.put("user", System.getProperty("db_user"))
    connProp.put("password", System.getProperty("db_password"))
    connProp
  }
}
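For reference, the same connection settings can also be passed through the DataFrameReader option API instead of a java.util.Properties object. This is only a sketch, assuming the same URL, tableName and JVM system properties used elsewhere in this post:

  // Sketch: equivalent read using explicit options instead of a Properties object.
  // URL and tableName are assumed to be the same values used in loadUserVerifyData.
  val optionBasedDF = spark.read
    .format("jdbc")
    .option("url", URL)
    .option("dbtable", tableName)
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("user", System.getProperty("db_user"))
    .option("password", System.getProperty("db_password"))
    .load()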
When I submit the Spark job (--master yarn --deploy-mode client) and call loadUserVerifyData from main, reading just 1000 rows (or fewer) takes 20+ minutes and still fails with the exception "org.apache.spark.SparkException: Job aborted due to stage failure:".
def loadUserVerifyData(spark: SparkSession, args: Array[String]): Unit = {
  val tableName = args(0)
  // number of partitions
  val partitionNum = args(1).toInt
  val idRange = CMOrderLog.getIDRange(tableName, URL)
  // start and end of the id range
  val start = idRange.get(0)
  val end = idRange.get(1)
  val originDF = spark.read.jdbc(URL, tableName, "id", start, end, partitionNum, DBHelper.setConnectionProperty())
  originDF.createOrReplaceTempView(tableName)
  val context = originDF.sqlContext
  context.udf.register("e_name", (str: String) => StringUtils.encrypt(str, 1))
  context.udf.register("e_card", (str: String) => StringUtils.encrypt(str))
  val sql = "SELECT t.*, e_name(t.auth_realname) as real_name, e_card(t.auth_person_card) as id_card FROM " + tableName + " t"
  val finalDF = context.sql(sql).drop("auth_realname", "auth_person_card")
  finalDF.write.mode(SaveMode.Overwrite).parquet(OUTPUT + tableName)
  spark.stop()
}
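A note on the id bounds above: Spark does not use lowerBound/upperBound to filter rows; together with numPartitions they only decide the per-partition WHERE clauses on the partition column, so a degenerate range pushes almost all rows into a single task. The helper below is my own illustration (showPartitionPredicates is not a Spark API, and Spark's exact stride formula differs slightly), just to show roughly what those predicates look like:

  // Rough sketch of how a partitioned JDBC read is split into per-task predicates.
  def showPartitionPredicates(lowerBound: Long, upperBound: Long, numPartitions: Int): Seq[String] = {
    val stride = (upperBound - lowerBound) / numPartitions
    (0 until numPartitions).map { i =>
      val lo = lowerBound + i * stride
      val hi = lo + stride
      if (i == 0) s"id < $hi OR id IS NULL"                 // first partition also picks up NULL ids
      else if (i == numPartitions - 1) s"id >= $lo"         // last partition is open-ended
      else s"id >= $lo AND id < $hi"
    }
  }

  // Example: showPartitionPredicates(1L, 50000000L, 8) produces eight bounded
  // predicates, so each executor task runs one range query instead of a full scan.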
Exception log:
2018-08-26 16:15:02 INFO DAGScheduler:54 - ResultStage 0 (parquet at ReadDb2HDFS.scala:288) failed in 1008.933 s due to Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, master, executor 4): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 130349 ms
Driver stacktrace:
2018-08-26 16:15:02 INFO DAGScheduler:54 - Job 0 failed: parquet at ReadDb2HDFS.scala:288, took 1008.977605 s
2018-08-26 16:15:02 ERROR FileFormatWriter:91 - Aborting job null.
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, master, executor 4): ExecutorLostFailure (executor 4 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 130349 ms
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:194)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:154)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:547)
at com.cm.data.datasync.ReadDb2HDFS$.loadUserBaseData(ReadDb2HDFS.scala:288)
at com.cm.data.datasync.ReadDb2HDFS$.main(ReadDb2HDFS.scala:55)
at com.cm.data.datasync.ReadDb2HDFS.main(ReadDb2HDFS.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:879)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Answer 0 (score: 0):
Add the property connProp.put("fetchsize", "10000"), and run the job with this Spark configuration: --driver-memory 4G --num-executors 8 --executor-cores 1 --executor-memory 4G. Pass the parameters start = 0, end = 8, partitionNum = 8, and send me the logs if any exception occurs.
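A minimal sketch of how that suggestion would look in the code above, assuming the same DBHelper, URL, tableName and partition arguments from the question. "fetchsize" is an option the Spark JDBC source recognizes and applies as the statement fetch size; for MySQL Connector/J it may only take effect in batches when useCursorFetch=true is also set on the JDBC URL:

  // Sketch: reuse the question's helper and add the suggested fetch size.
  val connProp = DBHelper.setConnectionProperty()
  connProp.put("fetchsize", "10000")

  val originDF = spark.read.jdbc(URL, tableName, "id", start, end, partitionNum, connProp)

The memory settings (--driver-memory 4G --num-executors 8 --executor-cores 1 --executor-memory 4G) go on the spark-submit command line rather than in the code.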