我有一个基于 Avro 文件的 Hive 表,其中包含各种复杂数据类型。我在查询中只选择了少数几列,并按其中 8 列做了 GROUP BY。当我用这个 SELECT 查询向另一个 Hive 表插入数据时,它生成了 200 多个空的 part 文件。
我尝试用以下属性来减少文件数量,但到目前为止没有效果:
-- Session settings meant to reduce the number of output files written by the insert below.
-- NOTE(review): the hive.merge.* switches here are the ones documented for map-only and map-reduce jobs. If the job runs on another execution engine they presumably have no effect - confirm which engine is in use.
-- Request a single reduce task for the job.
SET mapred.reduce.tasks=1;
-- Ask Hive to merge small output files at the end of map-only jobs.
SET hive.merge.mapfiles=true;
-- Ask Hive to merge small output files at the end of map-reduce jobs.
SET hive.merge.mapredfiles=true;
-- Flattens the nested event payload into one row per
-- (event, asset name, event name) combination and appends it to flat_target.
-- Steps:
--   1. events_in_window: events whose ingesttimestamp falls in the half-open
--      window [last_process_timestamp, current_process_timestamp) of control_table.
--   2. flattened: explodes the nested arrays of the payload with OUTER lateral
--      views, so events with empty arrays are kept with NULL fields.
--   3. pivoted: collapses the name/value detail rows into one column per
--      detail name (ECatg, Elog, Esrc, ESeq) using MIN over a CASE expression.
--   4. final SELECT: keeps only groups that have no ESeq detail and replaces
--      NULL string fields with empty strings.
-- NOTE(review): flat_target is loaded by column position. The target column
-- order is not visible here - confirm it matches the final SELECT list.
WITH events_in_window AS (
    -- control_table is joined without a key on purpose: every control row
    -- defines a processing window and the WHERE clause keeps matching events.
    -- NOTE(review): payload is presumably a column of events - confirm.
    SELECT
        payload,
        x.ingesttimestamp
    FROM events x
    CROSS JOIN control_table t
    WHERE x.ingesttimestamp >= t.last_process_timestamp
      AND x.ingesttimestamp < t.current_process_timestamp
),
flattened AS (
    SELECT
        ev_v.col1 AS col1,
        ev_v.Createts AS Createts,
        assetnames_v.assetnamesname AS ESN,
        assetnames_v.assetnametype.assetnametypename AS EndDeviceEventType,
        ev_v.severity AS SevirityCD,
        enddeviceeventnames_v.enddeviceeventnametype.enddeviceeventnametypename AS DeviceEventAttributeName,
        enddeviceeventnames_v.enddeviceeventnamesname AS DeviceEventAttributeValue,
        enddeviceeventdetails_v.enddeviceeventdetailsname,
        enddeviceeventdetails_v.enddeviceeventdetailsvalue,
        mt.ingesttimestamp AS ingesttimestamp
    FROM events_in_window mt
    LATERAL VIEW OUTER explode(payload.ev) ev_t AS ev_v
    LATERAL VIEW OUTER explode(ev_v.asset.assetnames) assetnames_t AS assetnames_v
    LATERAL VIEW OUTER explode(ev_v.enddeviceeventnames) enddeviceeventnames_t AS enddeviceeventnames_v
    LATERAL VIEW OUTER inline(ev_v.enddeviceeventdetails) enddeviceeventdetails_v
),
pivoted AS (
    SELECT
        m.col1,
        m.Createts,
        MIN(CASE WHEN m.enddeviceeventdetailsname = 'ECatg' THEN m.enddeviceeventdetailsvalue END) AS ECatg,
        m.ESN,
        m.EndDeviceEventType,
        MIN(CASE WHEN m.enddeviceeventdetailsname = 'Elog' THEN m.enddeviceeventdetailsvalue END) AS LogTP,
        MIN(CASE WHEN m.enddeviceeventdetailsname = 'Esrc' THEN m.enddeviceeventdetailsvalue END) AS Scd,
        m.SevirityCD,
        MIN(CASE WHEN m.enddeviceeventdetailsname = 'ESeq' THEN m.enddeviceeventdetailsvalue END) AS SeqN,
        m.DeviceEventAttributeName,
        m.DeviceEventAttributeValue,
        m.ingesttimestamp
    FROM flattened m
    GROUP BY
        m.col1,
        m.Createts,
        m.ESN,
        m.EndDeviceEventType,
        m.SevirityCD,
        m.DeviceEventAttributeName,
        m.DeviceEventAttributeValue,
        m.ingesttimestamp
)
INSERT INTO TABLE flat_target
SELECT
    COALESCE(me.col1, '') AS col1,
    me.Createts,
    COALESCE(me.ECatg, '') AS ECatg,
    me.ESN,
    me.EndDeviceEventType,
    COALESCE(me.LogTP, '') AS LogTP,
    COALESCE(me.Scd, '') AS Scd,
    me.SevirityCD,
    -- NOTE(review): the WHERE clause below keeps only rows with a NULL SeqN,
    -- so this expression always yields an empty string - confirm that is intended.
    COALESCE(me.SeqN, '') AS SeqN,
    COALESCE(me.DeviceEventAttributeName, '') AS DeviceEventAttributeName,
    COALESCE(me.DeviceEventAttributeValue, '') AS DeviceEventAttributeValue,
    me.ingesttimestamp
FROM pivoted me
WHERE me.SeqN IS NULL;