I have the following Hive query:
DROP TABLE IF EXISTS dwo_analysis.spark_custom_global_authenticated_experiment_dashboard_report_activity;

CREATE TABLE dwo_analysis.spark_custom_global_authenticated_experiment_dashboard_report_activity (
  experiment_name varchar(255),
  variant_name varchar(255),
  first_date string,
  guid string,
  `create` int,   -- backticked: CREATE is a reserved keyword in Hive
  publish int,
  sumCreate int,
  sumPublish int
)
PARTITIONED BY (click_date date)  -- the partition column must not also appear in the column list
STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY");
-- Dynamic partitioning is needed so click_date can come from the SELECT
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

INSERT INTO TABLE dwo_analysis.spark_custom_global_authenticated_experiment_dashboard_report_activity
PARTITION (click_date)
SELECT
  'EmailDripCampaignGlobal' AS experiment_name,
  'treatment' AS variant_name,
  MIN(TO_DATE(b.min_date)) AS first_date,
  SUBSTR(sc.post_evar12,1,24) AS guid,
  MAX(CASE WHEN post_prop5='project:createClicked' THEN 1 ELSE 0 END) AS `create`,
  MAX(CASE WHEN post_prop5 IN ('project:exportCompleted','project:reExportCompleted') THEN 1 ELSE 0 END) AS publish,
  SUM(CASE WHEN post_prop5='project:createClicked' THEN 1 ELSE 0 END) AS sumCreate,
  SUM(CASE WHEN post_prop5 IN ('project:exportCompleted','project:reExportCompleted') THEN 1 ELSE 0 END) AS sumPublish,
  click_date   -- dynamic partition column must be last in the SELECT list
FROM sourcedata.sc_visitor_click_history_jun_2015 sc
INNER JOIN dwo_analysis.spark_experiment_email_drip_treatment b
  ON SUBSTR(sc.post_evar12,1,24) = b.guid
WHERE report_suite='adbemmarvelweb.prod'
  AND sc.date_time >= b.min_date
  AND click_date >= '2018-01-01'
  AND click_date < DATE_SUB(CURRENT_DATE, 3)
GROUP BY SUBSTR(sc.post_evar12,1,24), click_date;
It takes a very long time to run. Does anyone have suggestions on how to optimize it? The reason it takes so long is that the sc_visitor_click_history_jun_2015 table is about 10 TB in size.
Answer 0 (score: 0):
Enable Hive vectorization and try again:

set hive.vectorized.execution.enabled = true;
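
Two caveats worth noting: vectorization only kicks in for ORC-backed tables (your target table is ORC; this assumes the 10 TB source table is too), and the reduce side has its own switch. A minimal sketch of session settings to try together:

-- Process rows in batches of 1024 instead of one at a time (ORC tables only)
set hive.vectorized.execution.enabled = true;
-- Vectorize the reduce side as well; off by default in some Hive versions
set hive.vectorized.execution.reduce.enabled = true;

On Hive 2.3+, EXPLAIN VECTORIZATION on the query shows whether each operator was actually vectorized and, if not, why.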