Spark SQL - 查询运行两次

时间:2019-07-10 19:19:22

标签: apache-spark apache-spark-sql apache-spark-dataset apache-spark-2.0

我接手了一批用于 ETL 流程的 Spark 代码,其中包含很长且复杂的 SQL 语句。这些作业都会遇到 OOM(内存不足)错误,有时同一段代码执行一次就要花费 4 个小时。

他们有许多这样的 ETL 流程。我在下面粘贴了一个示例查询,其中包含冗长而复杂的连接,以及嵌套子查询、聚合、GROUP BY、ORDER BY 等。顺便说一句,这还不是完整的查询。

我是一名大数据科学家,负责对这些作业进行性能调优;但在我看来,这样的作业毫无意义。希望得到您的建议。

请查看下面的查询,他们将其用作

SqlContext.sql(below query).write.mode(append).insertinto(hivetbl)

这是使用 Spark 的正确方式吗?


-- Emits a single column, inception_date: MXEM.MXEM_DTE_CRE when it is greater than
-- MVE.mve_dateem, otherwise MVE.mve_dateem (this ELSE branch is also taken when the
-- comparison is not true because an operand is NULL).
-- The GROUP BY at the end of the statement groups on this same expression, so each
-- distinct value is returned once.
SELECT 
    CASE WHEN MXEM.MXEM_DTE_CRE > MVE.mve_dateem
     THEN MXEM.MXEM_DTE_CRE
     ELSE 
     MVE.mve_dateem END AS inception_date

-- Driving table. NOTE(review): TUN_MVE is schema-qualified here but referenced without a
-- schema in later joins and subqueries -- confirm both resolve to the same table.
FROM axa_gulf_gulf_dc_ciris.TUN_MVE MVE
-- MVE_F: policy_avn_stg row with the same AGENCE / BR / SERIE / CERT / AVN as MVE.
-- Its MVE_MAX_AVN, MVE_LAST_AVN and MVE_MR_AVN columns are referenced by later join conditions.
INNER JOIN axa_gulf_health_bv.policy_avn_stg MVE_F ON (
        MVE.MVE_AGENCE = MVE_F.MVE_AGENCE
        AND MVE.MVE_BR = MVE_F.MVE_BR
        AND MVE.MVE_SERIE = MVE_F.MVE_SERIE
        AND MVE.MVE_CERT = MVE_F.MVE_CERT
        AND MVE.MVE_AVN = MVE_F.MVE_AVN
        )
-- MXP4: LOB_MXP4 rows for the same key whose MXP4_AVN equals MVE_F.MVE_MAX_AVN.
-- NOTE(review): the INNER JOINs on MVEM and MXEM below compare MXP4 columns with '=',
-- so rows that this LEFT JOIN NULL-extends are discarded again; it effectively behaves
-- like an inner join.
LEFT JOIN LOB_MXP4 MXP4 ON MXP4.MXP4_AGENCE = MVE_F.MVE_AGENCE
    AND MXP4.MXP4_BR = MVE_F.MVE_BR
    AND MXP4.MXP4_SERIE = MVE_F.MVE_SERIE
    AND MXP4.MXP4_CERT = MVE_F.MVE_CERT
    AND MXP4.MXP4_AVN = MVE_F.MVE_MAX_AVN
-- MVEM: second pass over TUN_MVE, matched to MXP4 on AGENCE / BR / SERIE / CERT and
-- restricted to the row whose MVE_AVN equals MVE_F.MVE_LAST_AVN.
INNER JOIN TUN_MVE MVEM ON MXP4.MXP4_AGENCE = MVEM.MVE_AGENCE
    AND MXP4.MXP4_BR = MVEM.MVE_BR
    AND MXP4.MXP4_SERIE = MVEM.MVE_SERIE
    AND MXP4.MXP4_CERT = MVEM.MVE_CERT
    AND MVEM.MVE_AVN = MVE_F.MVE_LAST_AVN
-- MXEM: LOB_MXEM rows matching mve_cli1 / mve_cli2 from MVE and mxp4_num_1 / mxp4_num_2
-- from MXP4.
INNER JOIN LOB_MXEM MXEM ON (
        MXEM.mxem_cli_num_1 = mve.mve_cli1
        AND MXEM.mxem_cli_num_2 = mve.mve_cli2
        AND MXEM.mxem_num_1 = mxp4.mxp4_num_1
        AND MXEM.mxem_num_2 = mxp4.mxp4_num_2
        )
    -- Additional MXEM join filter: accept the row when mxp4_flag_del is 'N', or when it is
    -- 'Y' and mxem_dte_cre differs from mxem_dte_del (a NULL mxem_dte_del is replaced by 0).
    -- NOTE(review): confirm that the literal 0 compares as intended against the type of
    -- mxem_dte_cre.
    AND (
        (
            mxp4.mxp4_flag_del = 'Y'
            AND MXEM.mxem_dte_cre <> NVL(MXEM.mxem_dte_del, 0)
            )
        OR mxp4.mxp4_flag_del = 'N'
        )
-- LOB_MXP5 is joined without an alias and its columns are referenced unqualified; it is
-- matched to MVE on AGENCE / BR / SERIE / CERT / AVN. None of its columns are selected in
-- the visible part of the query, so here it only restricts (or multiplies) rows.
INNER JOIN LOB_MXP5 ON (
        mxp5_agence = mve.mve_agence
        AND mxp5_br = mve.mve_br
        AND mxp5_serie = mve.mve_serie
        AND mxp5_cert = mve.mve_cert
        AND mxp5_avn = mve.mve_avn
        )
-- MXP4_DEL: LOB_MXP4 re-exposed under d-prefixed column names, matched to MXP4 on
-- agence / br / serie / cert and limited to rows whose dmxp4_flag_del is 'Y'.
LEFT JOIN (
    SELECT
        mxp4_agence   AS dmxp4_agence,
        mxp4_br       AS dmxp4_br,
        mxp4_serie    AS dmxp4_serie,
        mxp4_cert     AS dmxp4_cert,
        mxp4_flag_del AS dmxp4_flag_del,
        mxp4_avn      AS dmxp4_avn
    FROM LOB_MXP4
) MXP4_DEL
    ON  mxp4.mxp4_agence = MXP4_DEL.dmxp4_agence
    AND mxp4.mxp4_br = MXP4_DEL.dmxp4_br
    AND mxp4.mxp4_serie = MXP4_DEL.dmxp4_serie
    AND mxp4.mxp4_cert = MXP4_DEL.dmxp4_cert
    AND MXP4_DEL.dmxp4_flag_del = 'Y'
-- MXP3: LOB_MXP3 row sharing MVE's AGENCE / BR / SERIE / CERT / AVN.
LEFT JOIN LOB_MXP3 MXP3
    ON  MVE.MVE_AGENCE = MXP3.MXP3_AGENCE
    AND MVE.MVE_BR = MXP3.MXP3_BR
    AND MVE.MVE_SERIE = MXP3.MXP3_SERIE
    AND MVE.MVE_CERT = MXP3.MXP3_CERT
    AND MVE.MVE_AVN = MXP3.MXP3_AVN
-- v_mxrg: every DRI_VOC column for voc_lang 'E' / voc_table_code '50000',
-- looked up by mxem.mxem_rel_grp.
LEFT JOIN (
    SELECT *
    FROM DRI_VOC
    WHERE voc_table_code = '50000'
      AND voc_lang = 'E'
) v_mxrg
    ON v_mxrg.voc_code = mxem.mxem_rel_grp
-- V_FC: DRI_FC codes other than 'ALL', each with its DRI_VOC description
-- (voc_lang 'E', voc_table_code '00008'), looked up by MVEM.mve_dev_prime.
LEFT JOIN (
    SELECT
        fc_code,
        voc_des AS fc_des
    FROM DRI_FC
    INNER JOIN DRI_VOC
        ON fc_code = voc_code
    WHERE fc_code != 'ALL'
      AND voc_lang = 'E'
      AND voc_table_code = '00008'
) V_FC
    ON v_fc.fc_code = MVEM.mve_dev_prime
-- FC: the DRI_FC row for the code resolved through V_FC.
LEFT JOIN DRI_FC FC
    ON V_FC.fc_code = fc.fc_code
-- CLI: REF_ASR row matching MVE's mve_cli1 / MVE_CLI2 pair.
LEFT JOIN REF_ASR CLI
    ON  cli.ASR_NUM_1 = MVE.mve_cli1
    AND cli.ASR_NUM_2 = MVE.MVE_CLI2
-- PMXP4: LOB_MXP4 rows for MVE's agence / br / serie / cert and the same
-- mxem_num_1 / mxem_num_2, whose mxp4_avn lies between MVEM.MVE_AVN and
-- MXP4.mxp4_avn (both bounds inclusive).
LEFT JOIN LOB_MXP4 PMXP4
    ON  mve.mve_agence = PMXP4.mxp4_agence
    AND mve.mve_br = PMXP4.mxp4_br
    AND mve.mve_serie = PMXP4.mxp4_serie
    AND mve.mve_cert = PMXP4.mxp4_cert
    AND PMXP4.mxp4_avn BETWEEN MVEM.MVE_AVN AND MXP4.mxp4_avn
    AND mxem_num_1 = PMXP4.mxp4_num_1
    AND mxem_num_2 = PMXP4.mxp4_num_2
-- The four lookups below all read DRI_VOC with voc_LANG = 'E'; they differ only in
-- voc_table_code and in the column they are matched against.
-- CLS: voc_table_code '50002', matched on mxp3.mxp3_class.
LEFT JOIN (
    SELECT
        VOC_CODE,
        VOC_DES
    FROM DRI_VOC
    WHERE voc_table_code = '50002'
      AND voc_LANG = 'E'
) CLS
    ON mxp3.mxp3_class = CLS.VOC_CODE
-- SEX: voc_table_code '00133', matched on mxem.mxem_sex.
LEFT JOIN (
    SELECT
        VOC_CODE,
        VOC_DES
    FROM DRI_VOC
    WHERE voc_table_code = '00133'
      AND voc_LANG = 'E'
) SEX
    ON mxem.mxem_sex = SEX.VOC_CODE
-- RG: voc_table_code '50000', matched on mxem.mxem_rel_grp.
LEFT JOIN (
    SELECT
        VOC_CODE,
        VOC_DES
    FROM DRI_VOC
    WHERE voc_table_code = '50000'
      AND voc_LANG = 'E'
) RG
    ON mxem.mxem_rel_grp = RG.VOC_CODE
-- GDRFA: voc_table_code '50545', matched on mxem.MXEM_GDRFA_STATUS.
LEFT JOIN (
    SELECT
        VOC_CODE,
        VOC_DES
    FROM DRI_VOC
    WHERE voc_table_code = '50545'
      AND voc_LANG = 'E'
) GDRFA
    ON mxem.MXEM_GDRFA_STATUS = GDRFA.VOC_CODE
-- BRANCH: UDW_BR codes with br_inuse = 'Y' and their DRI_VOC description
-- (voc_table_code '00003', voc_lang 'E'), matched on MVE.MVE_AGENCE.
LEFT JOIN (
    SELECT
        BR_CODE AS BRANCH_CODE,
        voc_des AS BRANCH_DES
    FROM UDW_BR
    INNER JOIN DRI_VOC
        ON br_code = voc_code
    WHERE br_inuse = 'Y'
      AND voc_table_code = '00003'
      AND voc_lang = 'E'
) BRANCH
    ON MVE.MVE_AGENCE = BRANCH_CODE
-- PKG: UDW_CPR codes and their DRI_VOC description
-- (voc_table_code '00035', voc_lang 'E'), matched on MVE.MVE_PROD.
LEFT JOIN (
    SELECT
        cpr_code,
        voc_des AS cpr_des
    FROM UDW_CPR
    INNER JOIN DRI_VOC
        ON CPR_code = voc_code
    WHERE voc_table_code = '00035'
      AND voc_lang = 'E'
) PKG
    ON MVE.MVE_PROD = PKG.CPR_CODE
-- LB: UDW_CT codes having a non-NULL CT_SP_FRM and their DRI_VOC description
-- (VOC_TABLE_CODE '00009', VOC_LANG 'E'), matched on MVE.MVE_BR.
LEFT JOIN (
    SELECT
        CT_CODE AS LOB_CODE,
        VOC_DES AS LOB_DES
    FROM UDW_CT
    INNER JOIN DRI_VOC
        ON CT_CODE = VOC_CODE
    WHERE CT_SP_FRM IS NOT NULL
      AND VOC_TABLE_CODE = '00009'
      AND VOC_LANG = 'E'
) LB
    ON MVE.MVE_BR = LOB_CODE
-- DR / RP: REF_DEFROLE row referenced by CLI.ASR_ACC_EXEC and the REF_PERSON behind it.
LEFT JOIN REF_DEFROLE DR
    ON DR.DEFROLE_ID = CLI.ASR_ACC_EXEC
LEFT JOIN REF_PERSON RP
    ON RP.PERSON_INTERNAL_ID = DR.DEFROLE_INTERNAL_ID
-- CRT / CRTI: REF_DEFROLE row referenced by MVE.MVE_AGENT and the REF_PERSON behind it.
LEFT JOIN REF_DEFROLE CRT
    ON CRT.DEFROLE_ID = MVE.MVE_AGENT
LEFT JOIN REF_PERSON CRTI
    ON CRTI.PERSON_INTERNAL_ID = CRT.DEFROLE_INTERNAL_ID
-- VP: DRI_pst rows with pst_quartier = '0' and pst_rue = '0', joined to their DRI_voc
-- entry (voc_table_code '00007', voc_LANG 'E'); matched on MXEM.mxem_county.
LEFT JOIN (
    SELECT
        VOC_CODE,
        VOC_DES AS NATINALITY_DESC,
        pst_region,
        pst_isocode
    FROM DRI_pst
    INNER JOIN DRI_voc
        ON pst_region = voc_code
    WHERE pst_quartier = '0'
      AND pst_rue = '0'
      AND voc_LANG = 'E'
      AND voc_table_code = '00007'
) VP
    ON MXEM.mxem_county = vp.pst_region
-- dh: LOB_DHANATIONALITYCODE row whose dhanationalitycode_iso_2 equals VP's pst_isocode.
LEFT JOIN LOB_DHANATIONALITYCODE dh
    ON vp.pst_isocode = dh.dhanationalitycode_iso_2
-- EM: DRI_transco entry of transco_table_code '50532' for mxem.mxem_visa_info.
LEFT JOIN DRI_transco EM
    ON  transco_table_code = '50532'
    AND mxem.mxem_visa_info = EM.transco_code
-- LOB_GP: LOR_LBG row whose lbg_lob equals MVE.mve_br.
LEFT JOIN LOR_LBG LOB_GP
    ON MVE.mve_br = LOB_GP.lbg_lob
-- FLTR: highest mve_echean per (MVE_AGENCE, MVE_BR, MVE_SERIE, MVE_CERT) in TUN_MVE.
LEFT JOIN (
    SELECT
        MVE_AGENCE,
        MVE_BR,
        MVE_SERIE,
        MVE_CERT,
        MAX(mve_echean) AS mve_echean
    FROM TUN_MVE
    GROUP BY
        MVE_AGENCE,
        MVE_BR,
        MVE_SERIE,
        MVE_CERT
) FLTR
    ON  FLTR.MVE_AGENCE = MVE.MVE_AGENCE
    AND FLTR.MVE_BR = MVE.MVE_BR
    AND FLTR.MVE_SERIE = MVE.MVE_SERIE
    AND FLTR.MVE_CERT = MVE.MVE_CERT
-- FLTR2: highest MVE_DATEFF_AVN per (MVE_AGENCE, MVE_BR, MVE_SERIE, MVE_CERT),
-- computed only over TUN_MVE rows with MVE_TYP_OPER = 'E'.
LEFT JOIN (
    SELECT
        MVE_AGENCE,
        MVE_BR,
        MVE_SERIE,
        MVE_CERT,
        MAX(MVE_DATEFF_AVN) AS MVE_DATEFF_AVN
    FROM TUN_MVE
    WHERE MVE_TYP_OPER = 'E'
    GROUP BY
        MVE_AGENCE,
        MVE_BR,
        MVE_SERIE,
        MVE_CERT
) FLTR2
    ON  FLTR2.MVE_AGENCE = MVE.MVE_AGENCE
    AND FLTR2.MVE_BR = MVE.MVE_BR
    AND FLTR2.MVE_SERIE = MVE.MVE_SERIE
    AND FLTR2.MVE_CERT = MVE.MVE_CERT
-- p: the tun_mve row with MVE's AGENCE / BR / SERIE / CERT whose MVE_AVN equals
-- MVE_F.MVE_MR_AVN.
LEFT JOIN tun_mve p
    ON  P.MVE_AGENCE = MVE.MVE_AGENCE
    AND P.MVE_BR = MVE.MVE_BR
    AND P.MVE_SERIE = MVE.MVE_SERIE
    AND P.MVE_CERT = MVE.MVE_CERT
    AND MVE_F.MVE_MR_AVN = P.MVE_AVN
-- NTWRK and NTWRKIP read the same DRI_VOC slice (voc_table_code '50011', voc_LANG 'E');
-- the first is matched on mxp3.mxp3_network, the second on mxp3.mxp3_network_IP.
LEFT JOIN (
    SELECT
        voc_code,
        voc_des
    FROM DRI_VOC
    WHERE voc_table_code = '50011'
      AND voc_LANG = 'E'
) NTWRK
    ON mxp3.mxp3_network = NTWRK.VOC_code
LEFT JOIN (
    SELECT
        voc_code,
        voc_des
    FROM DRI_VOC
    WHERE voc_table_code = '50011'
      AND voc_LANG = 'E'
) NTWRKIP
    ON mxp3.mxp3_network_IP = NTWRKIP.VOC_code
-- SBAND: lor_MXSALARYBAND codes with their DRI_VOC description
-- (voc_table_code '50537', voc_lang 'E'), matched on MXEM.mxem_salary_abv_flag.
LEFT JOIN (
    SELECT
        mxsalaryband_code,
        voc_des AS mxsalaryband_des
    FROM DRI_VOC
    INNER JOIN lor_MXSALARYBAND
        ON mxsalaryband_code = VOC_CODE
    WHERE voc_table_code = '50537'
      AND voc_lang = 'E'
) SBAND
    ON MXEM.mxem_salary_abv_flag = SBAND.mxsalaryband_code
-- AUTH: DRI_VOC entries of voc_table_code '50520' (voc_LANG 'E'),
-- matched on MXEM.mxem_authority.
LEFT JOIN (
    SELECT
        voc_code AS MEDICAL_AUTHORITY_CODE,
        voc_des  AS MEDICAL_AUTHORITY_DES
    FROM DRI_VOC
    WHERE voc_table_code = '50520'
      AND voc_LANG = 'E'
) AUTH
    ON MXEM.mxem_authority = AUTH.MEDICAL_AUTHORITY_CODE
-- MAREAH / SAREAH: LOR_MAREA and LOR_SAREA rows for MXEM's mxem_loc_country with
-- mxem_loc_cityh (and mxem_loc_areah for SAREAH).
LEFT JOIN LOR_MAREA MAREAH
    ON  MXEM.mxem_loc_country = MAREAH.marea_countrycod
    AND MXEM.mxem_loc_cityh = MAREAH.marea_cod
LEFT JOIN LOR_SAREA SAREAH
    ON  MXEM.mxem_loc_country = SAREAH.sarea_countrycod
    AND MXEM.mxem_loc_cityh = SAREAH.sarea_mareacod
    AND MXEM.mxem_loc_areah = SAREAH.sarea_cod
-- MAREAW: LOR_MAREA row for mxem_loc_country with mxem_loc_cityw.
LEFT JOIN LOR_MAREA MAREAW
    ON  MXEM.mxem_loc_country = MAREAW.marea_countrycod
    AND MXEM.mxem_loc_cityw = MAREAW.marea_cod
-- INSTPRM: per (instprm_agence, instprm_br, instprm_serie, instprm_cert) totals from
-- TUN_INSTPRM; NULL amounts are counted as 0 before summing.
LEFT JOIN (
    SELECT
        SUM(NVL(instprm_deja_paye, 0)) AS instprm_deja_paye,
        SUM(NVL(instprm_taux, 0))      AS instprm_taux,
        SUM(NVL(instprm_prime_t, 0))   AS instprm_prime_t,
        instprm_agence,
        instprm_br,
        instprm_serie,
        instprm_cert
    FROM TUN_INSTPRM
    GROUP BY
        instprm_agence,
        instprm_br,
        instprm_serie,
        instprm_cert
) INSTPRM
    ON  MVE.mve_agence = INSTPRM.instprm_agence
    AND MVE.mve_br = INSTPRM.instprm_br
    AND MVE.mve_serie = INSTPRM.instprm_serie
    AND MVE.mve_cert = INSTPRM.instprm_cert
-- CHN: DRI_VOC entries of voc_table_code '00115' (voc_lang 'E'),
-- matched on CRT.defrole_dist_channel.
LEFT JOIN (
    SELECT
        voc_code AS dist_channel_code,
        voc_des  AS dist_channel_des
    FROM DRI_VOC
    WHERE voc_table_code = '00115'
      AND voc_lang = 'E'
) CHN
    ON CHN.dist_channel_code = CRT.defrole_dist_channel
-- SAREAW: LOR_SAREA row for mxem_loc_country with mxem_loc_cityw and mxem_loc_areaw.
LEFT JOIN LOR_SAREA SAREAW
    ON  MXEM.mxem_loc_country = SAREAW.sarea_countrycod
    AND MXEM.mxem_loc_cityw = SAREAW.sarea_mareacod
    AND MXEM.mxem_loc_areaw = SAREAW.sarea_cod
-- Keep only MVE rows whose MVE_AGENCE is 12 or 13.
WHERE MVE.MVE_AGENCE IN (12, 13)
-- Group on the same expression that the SELECT list exposes as inception_date.
GROUP BY
    CASE
        WHEN MXEM.MXEM_DTE_CRE > MVE.mve_dateem THEN MXEM.MXEM_DTE_CRE
        ELSE MVE.mve_dateem
    END

2 个答案:

答案 0 :(得分:0)

您是否尝试过将这条 SQL 拆分成多个片段,每个片段大约只包含一个连接?

  1. 执行一个连接,将输出写入 Parquet 文件
  2. 将该输出与另一个表连接,并将结果写入 Parquet 文件
  3. 重复以上操作。

大数据中的连接是分布式执行的,并且依赖许多底层假设。顺便说一句,对于您的用例,PrestoDB 似乎是合适的选择。

答案 1 :(得分:0)

我也遇到过同样的问题。Spark 会通过运行一个返回空结果的用户查询来推断各列的类型。(请参阅 https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala 第 112 行的 getSchemaQuery 函数)

对于简单的查询,例如 (select ... from ...) where 1 = 0,这一步会非常快;但对于带有 join 操作的查询,它的速度取决于 join 操作本身,因此会很慢。

也许您可以使用 “customSchema” 选项(https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html)来避免 Spark 推断结果的类型。