Parquet 小文件问题

时间:2019-02-22 21:57:52

标签: hadoop hive

我们正面临分区中的小文件问题,导致分布不均和性能问题。

请协助我们解决如何避免此问题。(原文此处有一张图片)

Hive 代码

-- Session settings for a dynamic-partition load into a Snappy-compressed Parquet table.

-- Dynamic partitioning: partition values are taken from the query output;
-- nonstrict mode allows every partition column to be dynamic.
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Task container sizes (MB) and JVM heaps; the 4608m heap stays below the 5120 MB container.
SET mapreduce.map.memory.mb = 5120;
SET mapreduce.reduce.memory.mb = 5120;
SET mapreduce.map.java.opts = -Xmx4608m;
SET mapreduce.reduce.java.opts = -Xmx4608m;

-- Output compression.
SET parquet.compression = SNAPPY;
SET hive.exec.compress.output = true;
SET mapred.output.compression.codec = org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.output.compression.type = BLOCK;

-- Small-file merging.
-- hive.merge.mapfiles only merges the output of map-only jobs; hive.merge.mapredfiles
-- is required as well so that the output of map-reduce jobs is merged too.
-- NOTE(review): when running on Tez the equivalent switch is hive.merge.tezfiles;
-- confirm the execution engine before adding it.
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;
-- A merge pass is started when the average output file size is below this many bytes.
-- NOTE(review): 1000000 (~1 MB) is far below Hive's default of 16000000, so files that
-- average only a few MB are never merged; consider raising it.
SET hive.merge.smallfiles.avgsize = 1000000;
-- Target size in bytes of each merged file.
SET hive.merge.size.per.task = 128000000;

-- Load tn_1 from dev1_DP_TEMP.mobl_data_hr_summ_nsit_mud using dynamic partitions (dt, hr).
-- The inner query (tab1) renames the source columns to the target names and derives the
-- *_pii_md5 hashes and the ECF_* columns; the outer SELECT lists them in insert order.
-- Dynamic partition columns (dt, hr) must stay last in the SELECT list, in the same order
-- as in the PARTITION clause.
--
-- ECF_XTRC_TS is built as 'yyyy-MM-dd HH:00:00' from characters 1-4, 5-6 and 7-8 of dt and
-- characters 1-2 of hr. The date and time parts are joined with a single space; without
-- that separator the string is not a valid timestamp literal for the CAST.
--
-- DISTRIBUTE BY dt, hr sends every row of one (dt, hr) partition to the same reducer, so a
-- partition is written by one task instead of receiving a small file from every task.
-- NOTE(review): a partition that is much larger than the others will be concentrated on a
-- single reducer; confirm this trade-off against real data volumes.
--
-- NOTE(review): the source column names host, referer and localized_timestamp were
-- reconstructed from machine-translated text; confirm them against the source table schema.
-- NOTE(review): ECF_CLSE_TS is the string literal 'NULL', not SQL NULL; kept unchanged,
-- confirm that this is what the target column expects.
INSERT OVERWRITE TABLE tn_1 PARTITION (dt, hr)
SELECT
    tab1.proc_seq_no,
    tab1.accs_pont_nm,
    tab1.mobl_cnty_cd,
    tab1.mobl_ntwk_cd,
    tab1.host_url_id,
    tab1.refr_url_id,
    tab1.user_agnt_id,
    tab1.qry_domn_nm,
    tab1.rtsp_user_agnt_id,
    tab1.smtp_frst_err_cd,
    tab1.pop3_frst_fail_resp_cd,
    tab1.qsms_appl_stck_nm,
    tab1.qsms_appl_fmly_nm,
    tab1.imsi_id,
    tab1.imsi_id_pii_md5,
    tab1.msisdn_id,
    tab1.msisdn_id_pii_md5,
    tab1.imei_id,
    tab1.imei_id_pii_md5,
    tab1.ntwk_tech_type_cd,
    tab1.locl_ts,
    tab1.utc_ts,
    tab1.caus_cd,
    tab1.frst_url_id,
    tab1.rtsp_url_id,
    tab1.dvic_modl_nm,
    tab1.dvic_mnfc_nm,
    tab1.clnt_ip_id,
    tab1.clnt_ip_id_pii_md5,
    tab1.serv_ip_id,
    tab1.subs_type_cd,
    tab1.rslv_ip_id,
    tab1.min_flow_strt_ts,
    tab1.max_flow_end_ts,
    tab1.tcp_mdte_cnnc_qty,
    tab1.tcp_mdte_rtt_qty,
    tab1.upld_totl_time_qty,
    tab1.dwld_totl_time_qty,
    tab1.upld_totl_byte_qty,
    tab1.dwld_totl_byte_qty,
    tab1.upld_medn_rtr_qty,
    tab1.dwld_medn_rtr_qty,
    tab1.http_1xx_resp_qty,
    tab1.http_2xx_resp_qty,
    tab1.http_3xx_resp_qty,
    tab1.http_4xx_resp_qty,
    tab1.http_5xx_resp_qty,
    tab1.rtsp_1xx_resp_qty,
    tab1.rtsp_2xx_resp_qty,
    tab1.rtsp_3xx_resp_qty,
    tab1.rtsp_4xx_resp_qty,
    tab1.rtsp_5xx_resp_qty,
    tab1.ftp_file_trnf_qty,
    tab1.imap_mail_qty,
    tab1.smtp_recv_qty,
    tab1.pop3_mail_qty,
    tab1.tran_qty,
    tab1.upld_medn_tput_qty,
    tab1.dwld_medn_tput_qty,
    tab1.ECF_XTRC_TS,
    tab1.ECF_DELT_FLG,
    tab1.ECF_NSRT_PROC_RUN_NO,
    tab1.ECF_SRCE_SYST_CD,
    tab1.ECF_OPEN_TS,
    tab1.ECF_CLSE_TS,
    tab1.ECF_NSRT_TS,
    tab1.dt,
    tab1.hr
FROM (
    SELECT
        proc_seq_nr AS proc_seq_no,
        apn AS accs_pont_nm,
        mcc AS mobl_cnty_cd,
        mnc AS mobl_ntwk_cd,
        host AS host_url_id,
        referer AS refr_url_id,
        ua AS user_agnt_id,
        query_name AS qry_domn_nm,
        rtsp_ua AS rtsp_user_agnt_id,
        smtp_first_err_code AS smtp_frst_err_cd,
        first_failed_resp AS pop3_frst_fail_resp_cd,
        qosmos_app_stack AS qsms_appl_stck_nm,
        qosmos_app_family AS qsms_appl_fmly_nm,
        imsi AS imsi_id,
        MD5(imsi) AS imsi_id_PII_MD5,
        msisdn AS msisdn_id,
        MD5(msisdn) AS msisdn_id_PII_MD5,
        imei AS imei_id,
        MD5(imei) AS imei_id_PII_MD5,
        network_technology AS ntwk_tech_type_cd,
        CAST(localized_timestamp AS TIMESTAMP) AS locl_ts,
        CAST(utc_timestamp AS TIMESTAMP) AS utc_ts,
        cause_code AS caus_cd,
        url AS frst_url_id,
        rtsp_url AS rtsp_url_id,
        device_model AS dvic_modl_nm,
        device_manufacturer AS dvic_mnfc_nm,
        client_ip AS clnt_ip_id,
        MD5(client_ip) AS clnt_ip_id_PII_MD5,
        server_ip AS serv_ip_id,
        subscriber_type AS subs_type_cd,
        resolved_ip_add AS rslv_ip_id,
        min_flow_start AS min_flow_strt_ts,
        max_flow_end AS max_flow_end_ts,
        tcp_med_conn_time AS tcp_mdte_cnnc_qty,
        tcp_med_rtt AS tcp_mdte_rtt_qty,
        data_total_up_time AS upld_totl_time_qty,
        data_total_down_time AS dwld_totl_time_qty,
        data_total_up_vol AS upld_totl_byte_qty,
        data_total_down_vol AS dwld_totl_byte_qty,
        data_med_up_rtr_cnt AS upld_medn_rtr_qty,
        data_med_down_rtr_cnt AS dwld_medn_rtr_qty,
        http_1xx_resp AS http_1xx_resp_qty,
        http_2xx_resp AS http_2xx_resp_qty,
        http_3xx_resp AS http_3xx_resp_qty,
        http_4xx_resp AS http_4xx_resp_qty,
        http_5xx_resp AS http_5xx_resp_qty,
        rtsp_1xx_resp AS rtsp_1xx_resp_qty,
        rtsp_2xx_resp AS rtsp_2xx_resp_qty,
        rtsp_3xx_resp AS rtsp_3xx_resp_qty,
        rtsp_4xx_resp AS rtsp_4xx_resp_qty,
        rtsp_5xx_resp AS rtsp_5xx_resp_qty,
        ftp_file_transfer AS ftp_file_trnf_qty,
        imap_mail_count AS imap_mail_qty,
        smtp_receivers AS smtp_recv_qty,
        pop3_mail_count AS pop3_mail_qty,
        data_total_hit AS tran_qty,
        data_med_up_throughput AS upld_medn_tput_qty,
        data_med_down_throughput AS dwld_medn_tput_qty,
        CAST(
            CONCAT(
                SUBSTR(DT, 1, 4), '-', SUBSTR(DT, 5, 2), '-', SUBSTR(DT, 7, 2),
                ' ',
                SUBSTR(HR, 1, 2), ':00:00'
            ) AS TIMESTAMP
        ) AS ECF_XTRC_TS,
        'N' AS ECF_DELT_FLG,
        CAST('1000' AS DECIMAL) AS ECF_NSRT_PROC_RUN_NO,
        'NDC' AS ECF_SRCE_SYST_CD,
        CURRENT_TIMESTAMP() AS ECF_OPEN_TS,
        'NULL' AS ECF_CLSE_TS,
        CURRENT_TIMESTAMP() AS ECF_NSRT_TS,
        DT,
        HR
    FROM dev1_DP_TEMP.mobl_data_hr_summ_nsit_mud
) tab1
DISTRIBUTE BY dt, hr;

0 个答案:

没有答案