我们的分区中存在大量小文件，导致数据分布不均并引发性能问题。
Hive 代码：
-- Session configuration for the dynamic-partition load that follows.

-- Enable dynamic partitioning. The INSERT below supplies no static value for
-- either partition column (dt, hr), which requires nonstrict mode.
SET hive.exec.dynamic.partition = TRUE;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Container sizes (MB) for map and reduce tasks, with the JVM heap kept
-- below the container size (4608 MB heap inside a 5120 MB container).
SET mapreduce.map.memory.mb = 5120;
SET mapreduce.reduce.memory.mb = 5120;
SET mapreduce.map.java.opts = -Xmx4608m;
SET mapreduce.reduce.java.opts = -Xmx4608m;

-- Snappy-compress the output.
SET parquet.compression = SNAPPY;
SET hive.exec.compress.output = true;
SET mapred.output.compression.codec = org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.output.compression.type = BLOCK;

-- Small-file merging at the end of the job.
-- NOTE(review): hive.merge.mapfiles only applies to map-only jobs; jobs with a
-- reduce stage are controlled by hive.merge.mapredfiles, which is not set
-- here -- confirm whether it is needed.
SET hive.merge.mapfiles = true;
-- Merge is triggered only when the average output file size is below this
-- threshold (bytes). NOTE(review): 1000000 (~1 MB) is well below Hive's
-- documented default of 16000000, so files larger than ~1 MB on average are
-- never merged -- TODO confirm this is intended given the small-file problem.
SET hive.merge.smallfiles.avgsize = 1000000;
-- Target size (bytes) of the merged files.
SET hive.merge.size.per.task = 128000000;
-- Reload table tn_1 from dev1_DP_TEMP.mobl_data_hr_summ_nsit_mud using dynamic
-- partitions. The partition columns (dt, hr) must be the last two columns of
-- the SELECT list, in that order.
--
-- The inner query renames the source columns to the target naming scheme,
-- adds MD5 hashes of the subscriber identifiers and client IP, and derives
-- the ECF_* audit columns. The outer query only fixes the column order.
INSERT OVERWRITE TABLE tn_1 PARTITION (dt, hr)
SELECT
    tab1.proc_seq_no,
    tab1.accs_pont_nm,
    tab1.mobl_cnty_cd,
    tab1.mobl_ntwk_cd,
    tab1.host_url_id,
    tab1.refr_url_id,
    tab1.user_agnt_id,
    tab1.qry_domn_nm,
    tab1.rtsp_user_agnt_id,
    tab1.smtp_frst_err_cd,
    tab1.pop3_frst_fail_resp_cd,
    tab1.qsms_appl_stck_nm,
    tab1.qsms_appl_fmly_nm,
    tab1.imsi_id,
    tab1.imsi_id_pii_md5,
    tab1.msisdn_id,
    tab1.msisdn_id_pii_md5,
    tab1.imei_id,
    tab1.imei_id_pii_md5,
    tab1.ntwk_tech_type_cd,
    tab1.locl_ts,
    tab1.utc_ts,
    tab1.caus_cd,
    tab1.frst_url_id,
    tab1.rtsp_url_id,
    tab1.dvic_modl_nm,
    tab1.dvic_mnfc_nm,
    tab1.clnt_ip_id,
    tab1.clnt_ip_id_pii_md5,
    tab1.serv_ip_id,
    tab1.subs_type_cd,
    tab1.rslv_ip_id,
    tab1.min_flow_strt_ts,
    tab1.max_flow_end_ts,
    tab1.tcp_mdte_cnnc_qty,
    tab1.tcp_mdte_rtt_qty,
    tab1.upld_totl_time_qty,
    tab1.dwld_totl_time_qty,
    tab1.upld_totl_byte_qty,
    tab1.dwld_totl_byte_qty,
    tab1.upld_medn_rtr_qty,
    tab1.dwld_medn_rtr_qty,
    tab1.http_1xx_resp_qty,
    tab1.http_2xx_resp_qty,
    tab1.http_3xx_resp_qty,
    tab1.http_4xx_resp_qty,
    tab1.http_5xx_resp_qty,
    tab1.rtsp_1xx_resp_qty,
    tab1.rtsp_2xx_resp_qty,
    tab1.rtsp_3xx_resp_qty,
    tab1.rtsp_4xx_resp_qty,
    tab1.rtsp_5xx_resp_qty,
    tab1.ftp_file_trnf_qty,
    tab1.imap_mail_qty,
    tab1.smtp_recv_qty,
    tab1.pop3_mail_qty,
    tab1.tran_qty,
    tab1.upld_medn_tput_qty,
    tab1.dwld_medn_tput_qty,
    tab1.ECF_XTRC_TS,
    tab1.ECF_DELT_FLG,
    tab1.ECF_NSRT_PROC_RUN_NO,
    tab1.ECF_SRCE_SYST_CD,
    tab1.ECF_OPEN_TS,
    tab1.ECF_CLSE_TS,
    tab1.ECF_NSRT_TS,
    tab1.dt,
    tab1.hr
FROM (
    SELECT
        proc_seq_nr AS proc_seq_no,
        apn AS accs_pont_nm,
        mcc AS mobl_cnty_cd,
        mnc AS mobl_ntwk_cd,
        -- NOTE(review): the source column names `host`, `referer` and
        -- `local_timestamp` were reconstructed from garbled text -- TODO
        -- confirm against the schema of mobl_data_hr_summ_nsit_mud.
        host AS host_url_id,
        referer AS refr_url_id,
        ua AS user_agnt_id,
        query_name AS qry_domn_nm,
        rtsp_ua AS rtsp_user_agnt_id,
        smtp_first_err_code AS smtp_frst_err_cd,
        first_failed_resp AS pop3_frst_fail_resp_cd,
        qosmos_app_stack AS qsms_appl_stck_nm,
        qosmos_app_family AS qsms_appl_fmly_nm,
        imsi AS imsi_id,
        MD5(imsi) AS imsi_id_PII_MD5,
        msisdn AS msisdn_id,
        MD5(msisdn) AS msisdn_id_PII_MD5,
        imei AS imei_id,
        MD5(imei) AS imei_id_PII_MD5,
        network_technology AS ntwk_tech_type_cd,
        CAST(local_timestamp AS TIMESTAMP) AS locl_ts,
        CAST(utc_timestamp AS TIMESTAMP) AS utc_ts,
        cause_code AS caus_cd,
        url AS frst_url_id,
        rtsp_url AS rtsp_url_id,
        device_model AS dvic_modl_nm,
        device_manufacturer AS dvic_mnfc_nm,
        client_ip AS clnt_ip_id,
        MD5(client_ip) AS clnt_ip_id_PII_MD5,
        server_ip AS serv_ip_id,
        subscriber_type AS subs_type_cd,
        resolved_ip_add AS rslv_ip_id,
        min_flow_start AS min_flow_strt_ts,
        max_flow_end AS max_flow_end_ts,
        tcp_med_conn_time AS tcp_mdte_cnnc_qty,
        tcp_med_rtt AS tcp_mdte_rtt_qty,
        data_total_up_time AS upld_totl_time_qty,
        data_total_down_time AS dwld_totl_time_qty,
        data_total_up_vol AS upld_totl_byte_qty,
        data_total_down_vol AS dwld_totl_byte_qty,
        data_med_up_rtr_cnt AS upld_medn_rtr_qty,
        data_med_down_rtr_cnt AS dwld_medn_rtr_qty,
        http_1xx_resp AS http_1xx_resp_qty,
        http_2xx_resp AS http_2xx_resp_qty,
        http_3xx_resp AS http_3xx_resp_qty,
        http_4xx_resp AS http_4xx_resp_qty,
        http_5xx_resp AS http_5xx_resp_qty,
        rtsp_1xx_resp AS rtsp_1xx_resp_qty,
        rtsp_2xx_resp AS rtsp_2xx_resp_qty,
        rtsp_3xx_resp AS rtsp_3xx_resp_qty,
        rtsp_4xx_resp AS rtsp_4xx_resp_qty,
        rtsp_5xx_resp AS rtsp_5xx_resp_qty,
        ftp_file_transfer AS ftp_file_trnf_qty,
        imap_mail_count AS imap_mail_qty,
        smtp_receivers AS smtp_recv_qty,
        pop3_mail_count AS pop3_mail_qty,
        data_total_hit AS tran_qty,
        data_med_up_throughput AS upld_medn_tput_qty,
        data_med_down_throughput AS dwld_medn_tput_qty,
        -- Build 'yyyy-MM-dd HH:00:00' from the partition values; this assumes
        -- DT is formatted yyyyMMdd and HR starts with a two-digit hour --
        -- TODO confirm. The single space between the date and hour parts is
        -- required for the string to cast to a valid TIMESTAMP.
        CAST(CONCAT(SUBSTR(DT, 1, 4), '-', SUBSTR(DT, 5, 2), '-', SUBSTR(DT, 7, 2), ' ', SUBSTR(HR, 1, 2), ':00:00') AS TIMESTAMP) AS ECF_XTRC_TS,
        'N' AS ECF_DELT_FLG,
        CAST('1000' AS DECIMAL) AS ECF_NSRT_PROC_RUN_NO,
        'NDC' AS ECF_SRCE_SYST_CD,
        CURRENT_TIMESTAMP() AS ECF_OPEN_TS,
        -- NOTE(review): this is the four-character string 'NULL', not SQL
        -- NULL; left unchanged -- confirm which one the target column expects.
        'NULL' AS ECF_CLSE_TS,
        CURRENT_TIMESTAMP() AS ECF_NSRT_TS,
        DT,
        HR
    FROM dev1_DP_TEMP.mobl_data_hr_summ_nsit_mud
) tab1