如何在字符串中找到项目的并集

时间:2017-05-31 21:50:16

标签: sql hadoop hive

我有两张表。第一张表(表1)如下:

Customer_id   Item_Id
   23           1
   23           2 
   23           1
   24           5
   24           3
   25           4

表2

 Item_Id      Connected_Items
    1             2,3,4
    2             1,3,6
    3             5
    4             8,7
    5             2,3

我想要的输出是:对每个客户,取其已购买物品所关联(connected)的全部物品的并集,并从中去掉重复的物品以及该客户已经购买过的物品。

Customer_id    Connected_Items
     23            3,4,6
     24            2
     25            8,7

我的做法是先对 Customer_id 和 Item_Id 去重,然后将结果与表2进行连接(JOIN)。

 -- Asker's attempt: de-duplicate the (customer, item) purchases, then attach the
 -- connected-items list of every purchased item.
 -- Known limitation (this is what the question asks about): the lists are returned
 -- as-is, so items repeated across lists and items the customer already bought
 -- are still present in the result.
 SELECT
        a.customer_id,
        b.connected_items
    FROM (
        -- One row per distinct (customer_id, item_id) purchase.
        SELECT
           customer_id,
           item_id
        FROM
           Table1
        GROUP BY
           customer_id,
           item_id) a
     JOIN
         Table2 b
     ON
         a.item_id = b.item_id

我得到的输出是

    23       1,2,3,3,4,6
    24        2,3,5
    25         8,7

如何删除客户已购买的商品并从connected_items列表中删除重复商品?

4 个答案:

答案 0 :(得分:0)

您可以在sql server中执行以下操作。

-- SQL Server: for each customer, list the distinct items connected to what the
-- customer bought, excluding every item the customer has already bought.

-- Step 1: pair each purchase with the comma-separated list of its connected items.
WITH customdata AS (
    SELECT
        t1.Customer_id,
        t2.Item_Id,
        t2.Connected_items
    FROM table1 AS t1
    INNER JOIN table2 AS t2
        ON t1.Item_Id = t2.Item_Id
),
-- Step 2: split each list into one row per connected item by turning
-- 'a,b,c' into '<X>a</X><X>b</X><X>c</X>' and shredding the XML nodes.
rowData AS (
    SELECT
        F1.Customer_id,
        F1.Item_Id,
        LTRIM(RTRIM(O.splitdata)) AS splitdata  -- tolerate '1, 2' style spacing
    FROM (
        SELECT
            F.Customer_id,
            F.Item_Id,
            CAST('<X>' + REPLACE(F.Connected_items, ',', '</X><X>') + '</X>' AS XML) AS xmlfilter
        FROM customdata AS F
    ) AS F1
    CROSS APPLY (
        SELECT fdata.D.value('.', 'varchar(50)') AS splitdata
        FROM F1.xmlfilter.nodes('X') AS fdata(D)
    ) AS O
),
-- Step 3: keep each (customer, connected item) once and drop items already bought.
-- The purchase check reads table1 directly: deriving it from the join above would
-- miss purchased items that have no row of their own in table2.
FinalSet AS (
    SELECT DISTINCT
        aa.Customer_id,
        aa.splitdata
    FROM rowData AS aa
    WHERE aa.splitdata <> ''  -- skip empty tokens produced by stray commas
      AND NOT EXISTS (
            SELECT *
            FROM table1 AS bought
            WHERE bought.Customer_id = aa.Customer_id
              -- Compare as text so a non-numeric token cannot raise a conversion
              -- error. NOTE(review): Item_Id looks numeric in the sample data; confirm.
              AND CAST(bought.Item_Id AS varchar(50)) = aa.splitdata
      )
)
-- Step 4: re-assemble one comma-separated list per customer.
SELECT
    p.Customer_id,
    STUFF(
        (
            SELECT ',' + p1.splitdata
            FROM FinalSet AS p1
            WHERE p1.Customer_id = p.Customer_id
            ORDER BY p1.splitdata           -- deterministic (textual) item order
            FOR XML PATH(''), TYPE
        ).value('.', 'NVARCHAR(MAX)'),
        1, 1, ''                            -- strip the leading comma
    ) AS Connected_Items
FROM FinalSet AS p
GROUP BY p.Customer_id;

答案 1 :(得分:0)

使用以下查询可以在 Hive 中从列表中删除重复项。

-- Hive: collapse each customer's comma-separated Connected_Items lists into a
-- single list in which every item appears exactly once.
-- collect_set keeps one copy of each item; filtering on count(*) = 1 instead
-- would discard every item that occurs more than once.
-- Note: this only de-duplicates; it does not remove items the customer bought.
select
    Customer_id,
    concat_ws(',', collect_set(ci)) as Connected_Items
from table_name t
lateral view explode(split(Connected_Items, ',')) temp as ci
group by Customer_id;

答案 2 :(得分:0)

-- Hive: per customer, build one string of the form
--   <connected items>,:,<purchased items>
-- then delete every token that occurs again later in that string. This removes
-- repeated connected items and, because the purchased items follow the ':' marker,
-- any connected item the customer bought. Only the part before the marker is kept.
SELECT
    purchase.customer_id,
    split(
        regexp_replace(
            concat_ws(
                ',',
                concat_ws(',', collect_set(link.connected_items)),
                ':',
                concat_ws(',', collect_set(string(purchase.item_id)))
            ),
            -- a token followed by a comma, where the same token appears again
            -- further on as a whole comma-delimited token
            '(?<=^|,)(?<item>.*?),(?=.*(?<=,)\\k<item>(?=,|$))',
            ''
        ),
        ',?:'
    )[0] AS Connected_Items
FROM Table1 AS purchase
JOIN Table2 AS link
    ON link.item_id = purchase.item_id
GROUP BY purchase.customer_id
+-------------+-----------------+
| customer_id | connected_items |
+-------------+-----------------+
|          23 | 4,3,6           |
|          24 | 2               |
|          25 | 8,7             |
+-------------+-----------------+

答案 3 :(得分:0)

-- Hive: list, per customer, the connected items that the customer has not bought.
WITH connected AS (
    -- One row per (item, connected item) obtained by splitting the list column.
    SELECT
        link.item_id,
        exploded.connected_item_id
    FROM Table2 link
    LATERAL VIEW explode(split(connected_items, ',')) exploded AS connected_item_id
),
tagged AS (
    -- tab = 1: items the customer purchased.
    SELECT
        1 AS tab,
        customer_id,
        item_id
    FROM Table1

    UNION ALL

    -- tab = 2: items connected to something the customer purchased.
    SELECT
        2 AS tab,
        purchase.customer_id,
        connected.connected_item_id AS item_id
    FROM Table1 AS purchase
    JOIN connected
        ON connected.item_id = purchase.item_id
),
not_purchased AS (
    -- Grouping removes duplicates; the HAVING clause keeps only items that
    -- never appear with tab = 1, i.e. were never purchased by that customer.
    SELECT
        customer_id,
        item_id
    FROM tagged
    GROUP BY
        customer_id,
        item_id
    HAVING min(CASE WHEN tab = 1 THEN 1 END) IS NULL
)
SELECT
    customer_id,
    concat_ws(',', collect_list(string(item_id))) AS connected_items
FROM not_purchased
GROUP BY customer_id
+-------------+-----------------+
| customer_id | connected_items |
+-------------+-----------------+
|          23 | 3,4,6           |
|          24 | 2               |
|          25 | 7,8             |
+-------------+-----------------+