Using Spark SQL, I read a 1373 MB CSV file in Spark, convert it into a DataFrame, and create several temp tables, each one derived from the previous temp table.

When I run the application with spark-submit, I see that a single job is created, but when I open the Stages tab I see two stages reading the same input file. Why does this happen?

I have only one action, so why does Spark read the same input file twice, in different stages? As you can see, both stage 0 and stage 2 run 11 tasks with the same input size.

Spark code (only part of the code is shown here):
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{length, trim}
import org.apache.spark.sql.hive.HiveContext

object OmegaETL extends BaseETL {

  val jobName = "OmegaETL"

  def main(args: Array[String]): Unit = {
    val omega_hdfs_input_loc = "/user/cloudera/data/inputfiles/Omega_events.csv"
    val sc = getSparkContext(jobName)
    val sqlContext = getHiveContext(sc)
    // Split each CSV line into its fields before passing it on
    val omegaRDD = getBaseRDD(sc, omega_hdfs_input_loc).map(_.split(","))
    val OmegaoutputDF = doOmegaProcess(sc, sqlContext, omegaRDD)
    OmegaoutputDF.registerTempTable("Omega_processed")
    sqlContext.sql("SELECT * FROM Omega_processed LIMIT 20").show(20)
  }
  def doOmegaProcess(sc: SparkContext, sqlContext: HiveContext, OmegaRDD: RDD[Array[String]]): DataFrame = {
    import sqlContext.implicits._
    log.info("doOmegaProcess method started")
    // Omega is a case class with ten String columns (its definition is not included here)
    val schemaRDD = OmegaRDD.map(arr => new Omega(arr(0), arr(1), arr(2), arr(3), arr(4),
      arr(5), arr(6), arr(7), arr(8), arr(9)))
    val OmegaDF = schemaRDD.toDF()
    val Omega_filterDF = OmegaDF.filter($"scheme_name".isNotNull && length(trim($"scheme_name")) > 0)
    Omega_filterDF.registerTempTable("Omega_filtered")
    //sqlContext.sql("SELECT * FROM Omega_filtered LIMIT 10").show(5)
    val latestEventDF = getLatestEvent(sc, sqlContext, Omega_filterDF) // Further transformations on Omega_filterDF alone; method not included here
    latestEventDF.registerTempTable("Omega_latest_event")
    //sqlContext.sql("SELECT * FROM Omega_latest_event LIMIT 10").show(10)
    val OmegaoutputDF = sqlContext.sql("""SELECT *
                                         |FROM (SELECT *
                                         |      FROM Omega_filtered
                                         |      WHERE TRIM(event_type) = 'ACTION') Omega_events
                                         |INNER JOIN Omega_latest_event
                                         |ON (Omega_events.data_id = Omega_latest_event.data_id
                                         |    AND Omega_events.conversion_base_type = Omega_latest_event.conversion_base_type)
                                         |""".stripMargin) // Trimmed for the question; the full query is big and also unions two temp tables
    log.info("doOmegaProcess method ended")
    OmegaoutputDF
  }
}
class BaseETL {
  @transient lazy val log = org.apache.log4j.Logger.getLogger(getClass)

  // Helpers referenced from main; shown here with their obvious implementations
  def getSparkContext(jobName: String): SparkContext =
    new SparkContext(new SparkConf().setAppName(jobName))

  def getHiveContext(sc: SparkContext): HiveContext = new HiveContext(sc)

  def getBaseRDD(sc: SparkContext, fileLoc: String): RDD[String] = sc.textFile(fileLoc)
}
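In case it helps diagnose this, the query plan can be printed with the DataFrame explain API (standard in the Spark 1.x API the code above uses); I would expect the two file scans to show up as two separate branches feeding the join:

// Print the logical and physical plans for the final DataFrame;
// each branch of the join traces its lineage back to the textFile call
OmegaoutputDF.explain(true)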
Can someone explain why Spark reads the same file twice, and how I can avoid it?
Is it happening because I am creating multiple temp tables?
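Would persisting the shared DataFrame before it branches help, so that both sides of the join reuse the cached rows instead of rescanning the CSV? A minimal sketch of what I mean, reusing the names from my code above (persist and StorageLevel are standard Spark APIs):

import org.apache.spark.storage.StorageLevel

// Materialize Omega_filterDF once; the Omega_filtered temp table and
// getLatestEvent would then both read from the cache rather than the file
val Omega_filterDF = OmegaDF
  .filter($"scheme_name".isNotNull && length(trim($"scheme_name")) > 0)
  .persist(StorageLevel.MEMORY_AND_DISK)
Omega_filterDF.registerTempTable("Omega_filtered")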