我接手了一批用于 ETL 流程的 Spark 代码,其中写的都是很长、很复杂的 SQL 语句。这些作业经常遇到 OOM(内存溢出)错误,同一段代码有时执行一次就要花费 4 个小时。
他们有许多这样的 ETL 流程。我在下面粘贴了一个示例查询:它包含很长的复杂连接,以及嵌套子查询、聚合、GROUP BY、ORDER BY 等等。顺便说一句,这还不是完整的查询。
我是一名大数据科学家,负责对这些作业进行性能调优;但在我看来,这种写法并不可取。希望能得到您的建议。
请查看下面的查询,他们是这样使用它的:
SqlContext.sql(below query).write.mode(append).insertinto(hivetbl)
这是使用 Spark 的正确方式吗?
-- Derives inception_date as the later of MXEM.MXEM_DTE_CRE and MVE.mve_dateem
-- (mve_dateem is used whenever MXEM_DTE_CRE is not strictly greater, including
-- when the comparison evaluates to NULL).
-- The driving table is TUN_MVE (alias MVE), restricted by the final WHERE to
-- MVE_AGENCE IN (12, 13). The GROUP BY repeats the SELECT expression, so the
-- portion shown returns one row per distinct inception_date.
-- NOTE(review): only one output column is visible, yet most aliases joined
-- below are never referenced by it — presumably the full select list uses
-- them; confirm before pruning any join.
SELECT
CASE WHEN MXEM.MXEM_DTE_CRE > MVE.mve_dateem
THEN MXEM.MXEM_DTE_CRE
ELSE
MVE.mve_dateem END AS inception_date
FROM axa_gulf_gulf_dc_ciris.TUN_MVE MVE
-- MVE_F: staging rows matched on the full five-column key
-- (agence, br, serie, cert, avn). It also supplies MVE_MAX_AVN, MVE_LAST_AVN
-- and MVE_MR_AVN, which later joins use to pick specific avn values.
INNER JOIN axa_gulf_health_bv.policy_avn_stg MVE_F ON (
MVE.MVE_AGENCE = MVE_F.MVE_AGENCE
AND MVE.MVE_BR = MVE_F.MVE_BR
AND MVE.MVE_SERIE = MVE_F.MVE_SERIE
AND MVE.MVE_CERT = MVE_F.MVE_CERT
AND MVE.MVE_AVN = MVE_F.MVE_AVN
)
-- MXP4: LOB_MXP4 row at MVE_F.MVE_MAX_AVN for the same agence/br/serie/cert.
-- NOTE(review): declared as LEFT JOIN, but the INNER JOINs to MVEM and MXEM
-- below compare against MXP4 columns, so a row with no MXP4 match cannot
-- survive them — this behaves as an inner join.
LEFT JOIN LOB_MXP4 MXP4 ON MXP4.MXP4_AGENCE = MVE_F.MVE_AGENCE
AND MXP4.MXP4_BR = MVE_F.MVE_BR
AND MXP4.MXP4_SERIE = MVE_F.MVE_SERIE
AND MXP4.MXP4_CERT = MVE_F.MVE_CERT
AND MXP4.MXP4_AVN = MVE_F.MVE_MAX_AVN
-- MVEM: second read of TUN_MVE, keyed through MXP4 and pinned to
-- MVE_F.MVE_LAST_AVN.
-- NOTE(review): TUN_MVE is unqualified here but schema-qualified in the FROM
-- clause — confirm both resolve to the same table.
INNER JOIN TUN_MVE MVEM ON MXP4.MXP4_AGENCE = MVEM.MVE_AGENCE
AND MXP4.MXP4_BR = MVEM.MVE_BR
AND MXP4.MXP4_SERIE = MVEM.MVE_SERIE
AND MXP4.MXP4_CERT = MVEM.MVE_CERT
AND MVEM.MVE_AVN = MVE_F.MVE_LAST_AVN
-- MXEM: LOB_MXEM rows matched on the client numbers from MVE and on
-- num_1/num_2 from MXP4.
INNER JOIN LOB_MXEM MXEM ON (
MXEM.mxem_cli_num_1 = mve.mve_cli1
AND MXEM.mxem_cli_num_2 = mve.mve_cli2
AND MXEM.mxem_num_1 = mxp4.mxp4_num_1
AND MXEM.mxem_num_2 = mxp4.mxp4_num_2
)
-- Still part of the MXEM ON clause: when MXP4 is flagged 'Y', mxem_dte_cre
-- must differ from mxem_dte_del (0 is substituted when mxem_dte_del is NULL);
-- rows flagged 'N' satisfy this predicate as-is.
-- NOTE(review): NVL(..., 0) compares mxem_dte_cre with a numeric literal —
-- confirm the column type makes that comparison meaningful.
AND (
(
mxp4.mxp4_flag_del = 'Y'
AND MXEM.mxem_dte_cre <> NVL(MXEM.mxem_dte_del, 0)
)
OR mxp4.mxp4_flag_del = 'N'
)
-- LOB_MXP5 is joined without an alias; its mxp5_* columns are referenced
-- unqualified and matched on MVE's five-column key.
INNER JOIN LOB_MXP5 ON (
mxp5_agence = mve.mve_agence
AND mxp5_br = mve.mve_br
AND mxp5_serie = mve.mve_serie
AND mxp5_cert = mve.mve_cert
AND mxp5_avn = mve.mve_avn
)
-- MXP4_DEL: LOB_MXP4 rows with flag_del = 'Y' for the same
-- agence/br/serie/cert as MXP4. dmxp4_avn is projected but is not part of
-- the join condition.
-- NOTE(review): with no avn in the ON clause this may match several rows per
-- MXP4 row — confirm that is intended.
LEFT JOIN (
SELECT mxp4_agence dmxp4_agence
,mxp4_br dmxp4_br
,mxp4_serie dmxp4_serie
,mxp4_cert dmxp4_cert
,mxp4_flag_del dmxp4_flag_del
,mxp4_avn dmxp4_avn
FROM LOB_MXP4
) MXP4_DEL ON (
MXP4_DEL.dmxp4_agence = mxp4.mxp4_agence
AND MXP4_DEL.dmxp4_br = mxp4.mxp4_br
AND MXP4_DEL.dmxp4_serie = mxp4.mxp4_serie
AND MXP4_DEL.dmxp4_cert = mxp4.mxp4_cert
AND MXP4_DEL.dmxp4_flag_del = 'Y'
)
-- MXP3: LOB_MXP3 on MVE's five-column key; its columns feed the CLS, NTWRK
-- and NTWRKIP lookups further down.
LEFT JOIN LOB_MXP3 MXP3 ON (
MXP3.MXP3_AGENCE = MVE.MVE_AGENCE
AND MXP3.MXP3_BR = MVE.MVE_BR
AND MXP3.MXP3_SERIE = MVE.MVE_SERIE
AND MXP3.MXP3_CERT = MVE.MVE_CERT
AND MXP3.MXP3_AVN = MVE.MVE_AVN
)
-- v_mxrg: DRI_VOC entries for voc_lang 'E' and voc_table_code '50000', keyed
-- on mxem_rel_grp.
-- NOTE(review): same filter and join key as the RG lookup further down, and
-- SELECT * carries every DRI_VOC column — confirm both are needed.
LEFT JOIN (
SELECT *
FROM DRI_VOC
WHERE voc_lang = 'E'
AND voc_table_code = '50000'
) v_mxrg ON mxem.mxem_rel_grp = v_mxrg.voc_code
-- V_FC: DRI_FC codes other than 'ALL' with their DRI_VOC description for
-- voc_table_code '00008'. Written as an implicit comma join; the join
-- condition (fc_code = voc_code) sits in the WHERE clause.
LEFT JOIN (
SELECT fc_code
,voc_des fc_des
FROM DRI_FC
,DRI_VOC
WHERE fc_code = voc_code
AND fc_code != 'ALL'
AND voc_lang = 'E'
AND voc_table_code = '00008'
) V_FC ON MVEM.mve_dev_prime = v_fc.fc_code
-- FC: DRI_FC joined again, on the code already obtained through V_FC.
LEFT JOIN DRI_FC FC ON fc.fc_code = V_FC.fc_code
-- CLI: REF_ASR matched on MVE's two client numbers.
LEFT JOIN REF_ASR CLI ON MVE.mve_cli1 = cli.ASR_NUM_1
AND MVE.MVE_CLI2 = cli.ASR_NUM_2
-- PMXP4: LOB_MXP4 rows for MVE's agence/br/serie/cert whose avn lies between
-- MVEM.MVE_AVN and MXP4.mxp4_avn (both inclusive) and whose num_1/num_2 equal
-- mxem_num_1/mxem_num_2 (referenced unqualified here).
-- NOTE(review): the avn range predicate can return several rows per input
-- row — confirm the expected cardinality.
LEFT JOIN LOB_MXP4 PMXP4 ON (
PMXP4.mxp4_agence = mve.mve_agence
AND PMXP4.mxp4_br = mve.mve_br
AND PMXP4.mxp4_serie = mve.mve_serie
AND PMXP4.mxp4_cert = mve.mve_cert
AND PMXP4.mxp4_avn <= MXP4.mxp4_avn
AND PMXP4.mxp4_avn >= MVEM.MVE_AVN
AND PMXP4.mxp4_num_1 = mxem_num_1
AND PMXP4.mxp4_num_2 = mxem_num_2
)
-- CLS: DRI_VOC code/description for voc_table_code '50002', keyed on
-- mxp3_class.
LEFT JOIN (
SELECT VOC_CODE
,VOC_DES
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '50002'
) CLS ON CLS.VOC_CODE = mxp3.mxp3_class
-- SEX: DRI_VOC code/description for voc_table_code '00133', keyed on
-- mxem_sex.
LEFT JOIN (
SELECT VOC_CODE
,VOC_DES
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '00133'
) SEX ON SEX.VOC_CODE = mxem.mxem_sex
-- RG: DRI_VOC code/description for voc_table_code '50000', keyed on
-- mxem_rel_grp (same filter and key as v_mxrg, with an explicit column list).
LEFT JOIN (
SELECT VOC_CODE
,VOC_DES
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '50000'
) RG ON RG.VOC_CODE = mxem.mxem_rel_grp
-- GDRFA: DRI_VOC code/description for voc_table_code '50545', keyed on
-- MXEM_GDRFA_STATUS.
LEFT JOIN (
SELECT VOC_CODE
,VOC_DES
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '50545'
) GDRFA ON GDRFA.VOC_CODE = mxem.MXEM_GDRFA_STATUS
-- BRANCH: UDW_BR codes with br_inuse = 'Y' and their DRI_VOC description
-- (voc_table_code '00003', comma join), keyed on MVE.MVE_AGENCE.
LEFT JOIN (
SELECT BR_CODE BRANCH_CODE
,voc_des BRANCH_DES
FROM UDW_BR
,DRI_VOC
WHERE br_code = voc_code
AND voc_table_code = '00003'
AND br_inuse = 'Y'
AND voc_lang = 'E'
) BRANCH ON BRANCH_CODE = MVE.MVE_AGENCE
-- PKG: UDW_CPR codes with their DRI_VOC description (voc_table_code '00035',
-- comma join), keyed on MVE.MVE_PROD.
LEFT JOIN (
SELECT cpr_code
,voc_des cpr_des
FROM UDW_CPR
,DRI_VOC
WHERE CPR_code = voc_code
AND voc_table_code = '00035'
AND voc_lang = 'E'
) PKG ON PKG.CPR_CODE = MVE.MVE_PROD
-- LB: UDW_CT codes whose CT_SP_FRM is not NULL, with their DRI_VOC
-- description (voc_table_code '00009', comma join), keyed on MVE.MVE_BR.
LEFT JOIN (
SELECT CT_CODE AS LOB_CODE
,VOC_DES AS LOB_DES
FROM UDW_CT
,DRI_VOC
WHERE CT_CODE = VOC_CODE
AND CT_SP_FRM IS NOT NULL
AND VOC_TABLE_CODE = '00009'
AND VOC_LANG = 'E'
) LB ON LOB_CODE = MVE.MVE_BR
-- DR / CRT: REF_DEFROLE looked up twice — by CLI.ASR_ACC_EXEC and by
-- MVE.MVE_AGENT. CRTI / RP: the REF_PERSON row for each of those roles,
-- matched through DEFROLE_INTERNAL_ID.
LEFT JOIN REF_DEFROLE DR ON CLI.ASR_ACC_EXEC = DR.DEFROLE_ID
LEFT JOIN REF_DEFROLE CRT ON MVE.MVE_AGENT = CRT.DEFROLE_ID
LEFT JOIN REF_PERSON CRTI ON CRT.DEFROLE_INTERNAL_ID = CRTI.PERSON_INTERNAL_ID
LEFT JOIN REF_PERSON RP ON DR.DEFROLE_INTERNAL_ID = RP.PERSON_INTERNAL_ID
-- VP: DRI_pst rows with pst_quartier = '0' and pst_rue = '0', comma-joined to
-- DRI_voc (voc_table_code '00007') on pst_region = voc_code; keyed on
-- MXEM.mxem_county. The description alias is spelled NATINALITY_DESC in the
-- source and is kept as-is because it is an output column name.
LEFT JOIN (
SELECT VOC_CODE
,VOC_DES NATINALITY_DESC
,pst_region
,pst_isocode
FROM DRI_pst
,DRI_voc
WHERE pst_quartier = '0'
AND pst_rue = '0'
AND pst_region = voc_code
AND voc_LANG = 'E'
AND voc_table_code = '00007'
) VP ON vp.pst_region = MXEM.mxem_county
-- dh: LOB_DHANATIONALITYCODE matched on the ISO code returned by VP.
LEFT JOIN LOB_DHANATIONALITYCODE dh ON dh.dhanationalitycode_iso_2 = vp.pst_isocode
-- EM: DRI_transco rows for transco_table_code '50532' (referenced
-- unqualified), keyed on mxem_visa_info.
LEFT JOIN DRI_transco EM ON transco_table_code = '50532'
AND EM.transco_code = mxem.mxem_visa_info
-- LOB_GP: LOR_LBG keyed on MVE.mve_br.
LEFT JOIN LOR_LBG LOB_GP ON LOB_GP.lbg_lob = MVE.mve_br
-- FLTR: maximum mve_echean per agence/br/serie/cert over the whole of
-- TUN_MVE (the subquery has no WHERE clause).
LEFT JOIN (
SELECT MVE_AGENCE
,MVE_BR
,MVE_SERIE
,MVE_CERT
,MAX(mve_echean) mve_echean
FROM TUN_MVE
GROUP BY MVE_AGENCE
,MVE_BR
,MVE_SERIE
,MVE_CERT
) FLTR ON (
MVE.MVE_AGENCE = FLTR.MVE_AGENCE
AND MVE.MVE_BR = FLTR.MVE_BR
AND MVE.MVE_SERIE = FLTR.MVE_SERIE
AND MVE.MVE_CERT = FLTR.MVE_CERT
)
-- FLTR2: maximum MVE_DATEFF_AVN per agence/br/serie/cert, restricted to rows
-- with MVE_TYP_OPER = 'E'.
-- NOTE(review): FLTR, FLTR2, MVEM and P each read TUN_MVE again in addition
-- to the FROM clause.
LEFT JOIN (
SELECT MVE_AGENCE
,MVE_BR
,MVE_SERIE
,MVE_CERT
,MAX(MVE_DATEFF_AVN) MVE_DATEFF_AVN
FROM TUN_MVE
WHERE MVE_TYP_OPER = 'E'
GROUP BY MVE_AGENCE
,MVE_BR
,MVE_SERIE
,MVE_CERT
) FLTR2 ON (
MVE.MVE_AGENCE = FLTR2.MVE_AGENCE
AND MVE.MVE_BR = FLTR2.MVE_BR
AND MVE.MVE_SERIE = FLTR2.MVE_SERIE
AND MVE.MVE_CERT = FLTR2.MVE_CERT
)
-- P: TUN_MVE row at MVE_F.MVE_MR_AVN for the same agence/br/serie/cert.
LEFT JOIN tun_mve p ON (
MVE.MVE_AGENCE = P.MVE_AGENCE
AND MVE.MVE_BR = P.MVE_BR
AND MVE.MVE_SERIE = P.MVE_SERIE
AND MVE.MVE_CERT = P.MVE_CERT
AND P.MVE_AVN = MVE_F.MVE_MR_AVN
)
-- NTWRK / NTWRKIP: the same DRI_VOC lookup (voc_table_code '50011') applied
-- to two different MXP3 columns, mxp3_network and mxp3_network_IP.
LEFT JOIN (
SELECT voc_code
,voc_des
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '50011'
) NTWRK ON (NTWRK.VOC_code = mxp3.mxp3_network)
LEFT JOIN (
SELECT voc_code
,voc_des
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '50011'
) NTWRKIP ON (NTWRKIP.VOC_code = mxp3.mxp3_network_IP)
-- SBAND: lor_MXSALARYBAND codes with their DRI_VOC description
-- (voc_table_code '50537', comma join), keyed on mxem_salary_abv_flag.
LEFT JOIN (
SELECT mxsalaryband_code
,voc_des mxsalaryband_des
FROM DRI_VOC
,lor_MXSALARYBAND
WHERE mxsalaryband_code = VOC_CODE
AND voc_table_code = '50537'
AND voc_lang = 'E'
) SBAND ON SBAND.mxsalaryband_code = MXEM.mxem_salary_abv_flag
-- AUTH: DRI_VOC code/description for voc_table_code '50520', keyed on
-- mxem_authority.
LEFT JOIN (
SELECT voc_code MEDICAL_AUTHORITY_CODE
,voc_des MEDICAL_AUTHORITY_DES
FROM DRI_VOC
WHERE voc_LANG = 'E'
AND voc_table_code = '50520'
) AUTH ON (AUTH.MEDICAL_AUTHORITY_CODE = MXEM.mxem_authority)
-- MAREAH / SAREAH: LOR_MAREA and LOR_SAREA resolved from mxem_loc_country
-- plus mxem_loc_cityh (and mxem_loc_areah for SAREAH).
-- MAREAW: the same LOR_MAREA lookup using mxem_loc_cityw.
LEFT JOIN LOR_MAREA MAREAH ON MAREAH.marea_countrycod = MXEM.mxem_loc_country
AND MAREAH.marea_cod = MXEM.mxem_loc_cityh
LEFT JOIN LOR_SAREA SAREAH ON SAREAH.sarea_countrycod = MXEM.mxem_loc_country
AND SAREAH.sarea_mareacod = MXEM.mxem_loc_cityh
AND SAREAH.sarea_cod = MXEM.mxem_loc_areah
LEFT JOIN LOR_MAREA MAREAW ON MAREAW.marea_countrycod = MXEM.mxem_loc_country
AND MAREAW.marea_cod = MXEM.mxem_loc_cityw
-- INSTPRM: per agence/br/serie/cert sums of three TUN_INSTPRM columns, with
-- NULL inputs counted as 0; the subquery has no WHERE clause.
LEFT JOIN (SELECT SUM(NVL(instprm_deja_paye,0)) instprm_deja_paye,SUM(NVL(instprm_taux,0)) instprm_taux,SUM(NVL(instprm_prime_t,0)) instprm_prime_t,instprm_agence,instprm_br,instprm_serie,instprm_cert
FROM TUN_INSTPRM
GROUP BY instprm_agence,instprm_br,instprm_serie,instprm_cert
) INSTPRM
ON INSTPRM.instprm_agence = MVE.mve_agence
AND INSTPRM.instprm_br = MVE.mve_br
AND INSTPRM.instprm_serie = MVE.mve_serie
AND INSTPRM.instprm_cert = MVE.mve_cert
-- CHN: DRI_VOC code/description for voc_table_code '00115', keyed on
-- CRT.defrole_dist_channel.
LEFT JOIN (
SELECT voc_code dist_channel_code
,voc_des dist_channel_des
FROM DRI_VOC
WHERE voc_lang = 'E'
AND voc_table_code = '00115'
) CHN ON CRT.defrole_dist_channel = CHN.dist_channel_code
-- SAREAW: LOR_SAREA resolved from mxem_loc_country, mxem_loc_cityw and
-- mxem_loc_areaw.
LEFT JOIN LOR_SAREA SAREAW ON SAREAW.sarea_countrycod = MXEM.mxem_loc_country
AND SAREAW.sarea_mareacod = MXEM.mxem_loc_cityw
AND SAREAW.sarea_cod = MXEM.mxem_loc_areaw
-- Keep only driving rows whose MVE_AGENCE is 12 or 13.
-- NOTE(review): this filter is written only on the outer query; the FLTR,
-- FLTR2 and INSTPRM aggregates above carry no agence predicate of their own —
-- check the physical plan to see whether they are computed over all agencies.
WHERE MVE.MVE_AGENCE IN (
12
,13
)
-- Groups on the same CASE expression as the SELECT list.
GROUP BY
CASE WHEN MXEM.MXEM_DTE_CRE > MVE.mve_dateem THEN MXEM.MXEM_DTE_CRE
ELSE MVE.mve_dateem END
答案 0 :(得分:0)
您是否尝试过把这条 SQL 拆成多个片段,每个片段大约只做一次连接?
大数据中的连接是分布式执行的,背后有许多基础假设。顺便说一句,对于您的用例,PrestoDB
似乎是合适的选择。
答案 1 :(得分:0)
我也遇到过同样的问题。Spark 会通过运行一次返回空结果的用户查询来推断各列的类型。(请查看 https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala 第 112 行的 getSchemaQuery 函数)
对于简单的查询,例如 (select ... from ...) where 1 = 0,这会非常快;但带有 join 操作的查询会很慢,耗时取决于 join 操作本身的速度。
也许您可以使用 “customSchema” 选项(https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html)来阻止 Spark 推断结果的类型。