我需要运行多个作业,这些作业包含大量的聚合操作(COALESCE、GROUP BY、LEFT JOIN)和条件检查,涉及多张行数超过 10 亿(1G)的表。Impala 集群能够完成这些工作,但我觉得耗时太长,有些查询可能需要 5 到 10 个小时才能完成。
我想知道,对于这类耗时很长的重型查询,通用的优化最佳实践是什么?
下面是一个典型的查询;不过我想问的是通用的最佳实践,因此这个查询仅供参考。
欢迎分享任何想法,非常感谢。
-- Resolves a pmaw value for each driver account and records where it came from.
-- Lookup order (first non-null wins):
--   1. w  : pmaw sheet row matching date_pull + region + product
--   2. w2 : MMA/BMMA pmaw sheet row matching date_pull + region
--   3. w3 : max MMA/BMMA pmaw for the date_pull across all regions
select distinct
    -- NOTE(review): the original select list started with a dangling comma, so
    -- its leading columns were missing and the statement could not parse. The
    -- account-level columns projected by subquery i are restored here; confirm
    -- the names and order against the target table.
    i.id_acct
    ,i.id_cust
    ,i.acct_presence
    ,i.dim_pas_region_key
    ,i.dim_ah_product_key
    ,coalesce(w.pmaw, w2.pmaw, w3.pmaw) as pmaw
    -- Labels the lookup step; the final label is also used when no step matched.
    ,case
        when w.pmaw is not null then 'pmaw_sheet'
        when w2.pmaw is not null then 'gap_fill_by_mma_in_same_region'
        else 'gap_fill_by_max_mma_of_all_regions'
    end as pmaw_source
    ,case when w.pmaw is not null then 1 else 0 end as is_actual_pmaw
    ,i.date_pull -- partitioned col needs to be last in select
from (
    -- Driver accounts enriched with the region/product dimension keys and the
    -- MMA line of business ('MMA pmaw' or 'BMMA pmaw') used by the gap-fill joins.
    select
        dr.date_pull
        ,dr.id_acct
        ,dr.id_cust
        ,dr.acct_presence
        ,pas_rg.leaflkp as dim_pas_region_key
        ,ah_prd.leaflkp as dim_ah_product_key
        ,case
            when coalesce(ah_prd.l4_stype, pas_prd.l3_category_cd) = 'PER' then 'MMA pmaw'
            else 'BMMA pmaw'
        end as pmaw_mma_lob
    from pasle.${VERSION}_fact_driver dr -- fact_driver: 985164794 rows
    left join pasle.${VERSION}_ah01 ah -- ah01: 894956463 rows
        on dr.date_pull = ah.date_pull
        and dr.id_acct = ah.id_acct
    left join pasle.${VERSION}_pas pas -- pas: 502785263 rows
        on pas.date_pull = dr.date_pull
        and pas.acct_num = dr.id_acct
    -- Product codes are compared with NULL treated as the empty string, so a
    -- NULL code on one side matches a NULL or '' code on the other.
    left join pasle.${VERSION}_dim_ah_product ah_prd -- dim_ah_product: 1241 rows
        on coalesce(ah.prod_code_1, '') = coalesce(ah_prd.l1_class, '')
        and coalesce(ah.prod_code_2, '') = coalesce(ah_prd.l2_service, '')
        and coalesce(ah.prod_code_3, '') = coalesce(ah_prd.l3_ptype, '')
        and coalesce(ah.prod_code_4, '') = coalesce(ah_prd.l4_stype, '')
        -- Remove leading zeroes in prod_code_5 for values that are all numeric
        -- to match the dimension table values.
        -- NOTE(review): assumes all-numeric prod_code_5 values fit in INT; confirm.
        and coalesce(
                case
                    when regexp_like(ah.prod_code_5, '^[0-9]+$')
                        then cast(cast(ah.prod_code_5 as int) as string)
                    else ah.prod_code_5
                end
                ,''
            ) = coalesce(ah_prd.appl_product_type, '')
    left join pasle.${VERSION}_dim_pas_product pas_prd -- dim_pas_product: 326 rows
        on pas_prd.product_cd = pas.product_cd
    left join pasle.${VERSION}_dim_pas_region pas_rg -- dim_pas_region: 23 rows
        on cast(pas_rg.bank_cd as string) = pas.bank_cd
        and cast(pas_rg.region_cd as string) = pas.region_cd
) i
-- pmaw straight from pmaw sheet
left join pasle.${VERSION}_pmaw_sheet w -- pmaw_sheet: 315795
    on w.date_pull = i.date_pull
    and w.dim_pas_region_key = i.dim_pas_region_key
    and w.dim_ah_product_key = i.dim_ah_product_key
-- pmaw gap fill case 1: when an account matches the pmaw sheet by region but
-- not by product, take the MMA/BMMA pmaw.
-- The following combinations from acct_presence=both fall into this category:
--   CHECKING DDA
--   Retirement IRA
--   Retirement QP
--   Retirement SAV
-- All PAS accounts fall into this category: match on region but not product.
left join pasle.${VERSION}_pmaw_sheet w2 -- pmaw_sheet: 315795
    on w2.pmawproduct in ('MMA pmaw', 'BMMA pmaw')
    and w2.date_pull = i.date_pull
    and w2.dim_pas_region_key = i.dim_pas_region_key
    and w2.pmawproduct = i.pmaw_mma_lob
-- pmaw gap fill case 2: for all AH_only accounts that match neither product
-- nor region, take the max pmaw.
-- For accounts in ah_only where ah_product_class in (Investment, Loan, Other,
-- Unmapped) and no region match, take max(pmaw) of the MMA product of the
-- corresponding lob.
left join (
    select
        date_pull
        ,pmawproduct
        ,max(pmaw) as pmaw
    from pasle.${VERSION}_pmaw_sheet -- pmaw_sheet: 315795
    where pmawproduct in ('MMA pmaw', 'BMMA pmaw')
    group by
        date_pull
        ,pmawproduct
) w3
    on w3.date_pull = i.date_pull
    and w3.pmawproduct = i.pmaw_mma_lob