Spark job creates too many tasks

Date: 2018-03-15 10:47:36

Tags: scala apache-spark spark-dataframe yarn cloudera

I am developing Scala code to run on a Cloudera cluster. My code is:

import org.apache.spark.sql.functions.{udf, col, when}

// maps a model id to a segment name
def func_segment (model: String) : String = {
    if (model == "A1" || model == "B1" || model == "C1" || model == "D1") "NAME1"
    else if (model == "A2" || model == "B2") "NAME2"
    else "NAME3"
}
// same logic as func_segment, wrapped as a UDF for use in DataFrame expressions
val func_segment_udf = udf(func_segment _)
def func_assetType (sp_rating : String) : String = {
    if(sp_rating.startsWith("A")) "B00001"
    else if(sp_rating.startsWith("BBB")) "B00002"
    else "B00003"
}
val func_assetType_udf = udf((sp_rating : String) => {
    // strip a trailing "u" (e.g. "BBBu") before mapping the rating
    if (sp_rating.endsWith("u")) func_assetType(sp_rating.dropRight(1))
    else func_assetType(sp_rating)
})
val func_nctaFormat_udf = udf((ncta:Int) => {
    "%018d".format(ncta)
})
def func_nctaFormat (ncta:Int) : String = {
    "%018d".format(ncta)
}
def func_assetTypeTotal (datos : List[String]) : String = {
    if (datos(0) != null && datos(0) != "" && ! datos(0).startsWith("SIN") ) func_assetType(datos(0))
    else if (datos(1) != null && datos(1) != "") func_assetType(datos(2))
    else func_assetType(datos(4))
}

val repos_cpty = sqlContext.sql("""
SELECT * FROM bu_local.vw_cpty
""")

val repos_aqa = sqlContext.sql("""
SELECT * FROM bu_local.vw_aqa
""")
val repos_garan = sqlContext.sql("""
SELECT * FROM bu_local.vw_repos_garan
""")
val repos_rating_conversion = sqlContext.sql("""
SELECT UPPER(segment) AS segment, internal_rating, sp_rating FROM bu_local.vw_rating_conversion
""")

val internal_rat = repos_garan.join(repos_cpty, repos_garan.col("id_garante") <=> repos_cpty.col("cpty"))
.join(repos_aqa, repos_cpty.col("parent") <=> repos_aqa.col("gl_code"))
.withColumn("segment", func_segment_udf(col("model_id_3")))
.select("id_emisor_garan","parentrating","segment").distinct()

val repos_aux2 = internal_rat.join(repos_rating_conversion, (internal_rat.col("segment") <=> repos_rating_conversion.col("segment")) && (internal_rat.col("parentrating") <=> repos_rating_conversion.col("internal_rating")))
.withColumn("asset_type_aux2", func_assetType_udf(col("sp_rating")))
.select("id_emisor_garan","asset_type_aux2").distinct()

val internal_rat2 = repos_garan.join(repos_cpty, repos_garan.col("id_garante") <=> repos_cpty.col("cpty")).join(repos_aqa, repos_cpty.col("parent") <=> repos_aqa.col("gl_code"))
.withColumn("segment",func_segment_udf(col("model_id_3")))
.select("id_garante","parentrating","segment").distinct()

val repos_aux1 = internal_rat2.join(repos_rating_conversion, (internal_rat2.col("segment") <=> repos_rating_conversion.col("segment")) && (internal_rat2.col("parentrating") <=> repos_rating_conversion.col("internal_rating")))
.withColumn("asset_type_aux1", func_assetType_udf(col("sp_rating")))
.select("id_garante","asset_type_aux1").distinct()

val repos_final = repos_garan
.join(repos_aux2.withColumnRenamed("id_emisor_garan","id_emisor_garan_aux2"), repos_garan.col("id_emisor_garan") <=> repos_aux2.col("id_emisor_garan"))
.join(repos_aux1.withColumnRenamed("id_garante","id_garante_aux1"), repos_garan.col("id_garante") <=> repos_aux1.col("id_garante"))
.withColumn("contract_id", func_nctaFormat_udf(col("ncta")))
.withColumn("asset_type_code",
    when(col("rating_ext").isNotNull && (col("rating_ext") !== "") && !col("rating_ext").startsWith("SIN"), func_assetType_udf(col("rating_ext")))
    .otherwise(when((col("tipo_garante") === "National/Sovereign Government") || (col("tipo_garante") === "GOVT NATIONAL"), func_assetType_udf(col("rating_ext_sov")))
    .otherwise(when(col("isin").isNotNull && (col("isin") !== ""), func_assetType_udf(col("asset_type_aux2")))
    .otherwise(func_assetType_udf(col("asset_type_aux1"))))))
.withColumn("guarantee_code", col("asset_type_code"))
.select("contract_id","asset_type_code","guarantee_code","venc_gran")

I can run a .show() on each of these vals and everything works until repos_final, which creates 79999 tasks and crashes. I suspect my code is not efficient, since I do not have much experience.
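
From what I have read, every join and distinct triggers a shuffle, and the number of tasks per shuffle stage is controlled by spark.sql.shuffle.partitions (200 by default), so chaining this many joins and distincts multiplies the task count. Lowering that setting is one thing I was considering, but I am not sure it is the right fix (the value 64 below is just an example):

// untested sketch: reduce the number of shuffle partitions before building repos_final
sqlContext.setConf("spark.sql.shuffle.partitions", "64")
// or set it at submit time: spark-submit --conf spark.sql.shuffle.partitions=64 ...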

Any suggestions?

0 Answers:

No answers yet