我有以下查询,其中的 mapper 需要非常长的时间才能完成。
我正在尝试对一张已分桶并排序的表做自连接(self join),但作业始终停留在 map 阶段,无法进入 reduce 阶段。阶段 1 有 8192 个 mapper 和 0 个 reducer;阶段 1 中的 mapper 始终无法完成,最终报错退出。
-- Two BIGINT columns; rows are hashed into 8192 buckets on user_id and each
-- bucket is kept sorted by user_id (ascending, which is the default order).
CREATE TABLE test (
    site_id BIGINT,
    user_id BIGINT
)
CLUSTERED BY (user_id)
SORTED BY (user_id ASC)
INTO 8192 BUCKETS;
-- Ask Hive to honour the target table's bucket layout and sort order on write.
-- NOTE(review): these two properties reportedly no longer exist from Hive 2.0
-- onward (bucketing/sorting always enforced) -- confirm for the target version.
SET hive.enforce.bucketing = true;
SET hive.enforce.sorting = true;
-- Replace the contents of test with the DISTINCT (uuid, slice_id) pairs below.
-- NOTE(review): the SELECT is incomplete as shown -- alias `sd` is never
-- defined, `FROM table A u` is not a complete table reference, and there is no
-- terminating semicolon; the rest of the query appears to have been elided.
-- NOTE(review): INSERT ... SELECT maps columns by position, so u.uuid lands in
-- site_id and sd.slice_id in user_id; the names suggest the reverse was
-- intended -- verify against the real query.
INSERT OVERWRITE TABLE test
SELECT
distinct
u.uuid,
sd.slice_id
FROM table A u
-- Problem statement: the mappers never complete for the query below.
-- EXPLAIN shows a Sorted Merge Bucket Map Join Operator in the plan.

-- Sort-merge-bucket (SMB) join settings.
SET hive.auto.convert.sortmerge.join=true;
SET hive.optimize.bucketmapjoin = true;
SET hive.optimize.bucketmapjoin.sortedmerge = true;
SET hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
SET hive.auto.convert.sortmerge.join.bigtable.selection.policy = org.apache.hadoop.hive.ql.optimizer.TableSizeBasedBigTableSelectorForAutoSMJ;

-- Recreate test from a self-join of test_tmp on user_id, keeping only the
-- pairs whose site ids differ (a NULL site_id is compared as 0).
DROP TABLE IF EXISTS test;

CREATE TABLE test AS
SELECT /*+ MAPJOIN(u2) */
    u1.user_id,
    u1.site_id AS site_id_a,
    u2.site_id AS site_id_b
FROM test_tmp u1
INNER JOIN test_tmp u2
    ON u1.user_id = u2.user_id
WHERE COALESCE(u1.site_id, 0) <> COALESCE(u2.site_id, 0);
提前致谢!