对于每个客户,我有两个项目列表:ItemListA和ItemListB
Customer_id ItemListA ItemListB
24 2,3 3,4,5
26 6,7 8,9,10
25 4,5 5,8
我希望得到ItemListB中不在ItemListA中的项目,因此输出应为:
Customer_id ItemListB_A
24 4,5
26 8,9,10
25 8
我不知道如何在Hive中对两个字符串做项目相减。我知道COLLECT_SET,但它只能去除重复项,不能去除两个列表的交集。
答案 0 :(得分:0)
这将产生预期的结果。
-- Per customer, return the ItemListB items that do not occur in ItemListA,
-- joined back into a comma-separated string (ItemListB_A).
--
-- Only ItemListB is exploded, so items that exist solely in ItemListA can never
-- reach the output. Counting occurrences over the concatenation of both lists
-- and keeping count = 1 would instead give the symmetric difference
-- (e.g. 2,4,5 rather than 4,5 for customer 24).
--
-- Notes:
--   * A customer whose ItemListB items all appear in ItemListA produces no row.
--   * An item repeated inside ItemListB is emitted once per occurrence.
--   * The order of items inside the result string is whatever collect_list
--     yields; do not rely on it matching the input order -- TODO confirm if
--     a stable order is required.
select
    t.Customer_id,
    concat_ws(',', collect_list(b.item)) as ItemListB_A
from table_name t
lateral view explode(split(t.ItemListB, ',')) b as item
-- keep a B item only when it is absent from the array built from ItemListA
where not array_contains(split(t.ItemListA, ','), b.item)
group by t.Customer_id;
答案 1 :(得分:0)
-- Strip from ItemListB every item that also occurs in ItemListA, in one regex pass.
--   1. Join the lists as '<ItemListB>,:,<ItemListA>'; the ':' marks where B ends.
--   2. Delete each 'item,' whose exact value shows up again later in the joined
--      string as a whole comma-delimited element (this also removes an earlier
--      copy of an item repeated inside ItemListB).
--   3. Split on the marker and keep the left-hand part, i.e. what is left of B.
with marked_list as (
    select
        customer_id,
        regexp_replace(
            concat(ItemListB,',:,',ItemListA),
            '(?<=^|,)(?<item>.*?),(?=.*(?<=,)\\k<item>(?=,|$))',
            ''
        ) as remaining
    from mytable
)
select
    customer_id,
    split(remaining, ',?:')[0] as ItemListB_A
from marked_list
+-------------+-------------+
| customer_id | itemlistb_a |
+-------------+-------------+
| 24 | 4,5 |
| 26 | 8,9,10 |
| 25 | 8 |
+-------------+-------------+
答案 2 :(得分:0)
-- Explode ItemListA followed by ItemListB into one positioned list per customer.
-- A position below the number of ItemListA elements means the value came from A;
-- an item is kept only if none of its occurrences came from A.
with tagged_item as (
    select
        t.customer_id,
        pe.item,
        -- 1 when this occurrence sits in the ItemListA part of the joined list
        case
            when pe.pos < size(split(t.ItemListA, ',')) then 1
            else 0
        end as is_from_a
    from mytable t
    lateral view posexplode(split(concat_ws(',', t.ItemListA, t.ItemListB), ',')) pe as pos, item
),
b_only_item as (
    select
        customer_id,
        item
    from tagged_item
    group by
        customer_id,
        item
    having sum(is_from_a) = 0
)
select
    customer_id,
    concat_ws(',', collect_list(item)) as ItemListB_A
from b_only_item
group by customer_id
+-------------+-------------+
| customer_id | itemlistb_a |
+-------------+-------------+
| 24 | 4,5 |
| 25 | 8 |
| 26 | 10,8,9 |
+-------------+-------------+
答案 3 :(得分:-1)
-- Pair every ItemListB item with every ItemListA item of the same customer,
-- then keep the B items that never equal any A item.
select
    b_only.Customer_id,
    concat_ws (',', collect_list (b_only.item)) as ItemListB_A
from (
    select
        src.Customer_id,
        b_item.item
    from test_hive src
    lateral view posexplode (split (concat_ws(',',ItemListB),',')) b_item as pos, item
    lateral view posexplode (split (concat_ws(',',ItemListA),',')) a_item as pos1, item1
    group by
        src.Customer_id,
        b_item.item
    -- zero matches across all pairings means the item is absent from ItemListA
    having count(case when b_item.item = a_item.item1 then 1 end) = 0
) b_only
group by b_only.Customer_id