在 Hive 中查找组合计数

时间:2017-02-22 05:44:30

标签: hadoop hive hiveql

我想使用 Hive 统计两列中值的组合出现的次数。

输入表:

+--------------+--------------------+
| Col1         |     Col2           |
+--------------+--------------------+
|  Sam         |     Ricky          |
|  Patel       |     Trump          |
|  Suzane      |     Robert         |
|  Ricky       |     Sam            |
|  Bob         |     Ricky          |
|  Robert      |     Suzane         |
+--------------+--------------------+

需要匹配的组合模式(两列的值互为颠倒的行视为同一组合):

.............................
|   Sam      |      Ricky   |
|   Ricky    |      Sam     |
|   Suzane   |      Robert  |
|   Robert   |      Suzane  |
.............................

预期输出:

  Sam     Ricky  2
  Robert  Suzane 2

1 个答案:

答案 0 :(得分:2)

-- Count how often each unordered pair of values occurs across (col1, col2).
-- least/greatest put the two values into a canonical order, so a row
-- (Sam, Ricky) and a row (Ricky, Sam) land in the same group.
select      least    (col1, col2)   as least_col
           ,greatest (col1, col2)   as greatest_col
           ,count    (*)

from        mytable

-- least/greatest return NULL when either argument is NULL (Hive 2.0.0+).
-- Without this filter every row with a missing value would collapse into a
-- single (NULL, NULL) group and could be reported as a repeated pair.
where       col1 is not null
        and col2 is not null

group by    least    (col1, col2)
           ,greatest (col1, col2)

-- Keep only the pairs that occur at least twice.
having     count (*) >= 2
;

演示

-- Demo fixture: a two-column string table loaded with the six sample rows
-- from the question.
create table mytable (
    Col1 string,
    Col2 string
);

insert into mytable values
    ('Sam',    'Ricky'),
    ('Patel',  'Trump'),
    ('Suzane', 'Robert'),
    ('Ricky',  'Sam'),
    ('Bob',    'Ricky'),
    ('Robert', 'Suzane')
;
-- Display the raw contents of the demo table.
select  *
from    mytable
;
+--------------+--------------+
| mytable.col1 | mytable.col2 |
+--------------+--------------+
| Sam          | Ricky        |
| Patel        | Trump        |
| Suzane       | Robert       |
| Ricky        | Sam          |
| Bob          | Ricky        |
| Robert       | Suzane       |
+--------------+--------------+
-- Intermediate step: show every row next to its normalized pair.
-- least_col and greatest_col are the same for a row and for its reversed
-- counterpart, which is what the grouping query relies on.
select
    col1,
    col2,
    least(col1, col2)    as least_col,
    greatest(col1, col2) as greatest_col
from
    mytable
;
+--------+--------+-----------+--------------+
|  col1  |  col2  | least_col | greatest_col |
+--------+--------+-----------+--------------+
| Sam    | Ricky  | Ricky     | Sam          |
| Patel  | Trump  | Patel     | Trump        |
| Suzane | Robert | Robert    | Suzane       |
| Ricky  | Sam    | Ricky     | Sam          |
| Bob    | Ricky  | Bob       | Ricky        |
| Robert | Suzane | Robert    | Suzane       |
+--------+--------+-----------+--------------+
-- Final step of the demo: group the rows by their normalized pair
-- (least value first, greatest value second) and report the pairs that
-- occur two or more times.
select      least    (col1, col2)   as least_col
           ,greatest (col1, col2)   as greatest_col
           ,count    (*)

from        mytable

-- Rows with a NULL in either column are excluded. least/greatest return NULL
-- when any argument is NULL (Hive 2.0.0+), so such rows would otherwise all
-- be merged into one misleading (NULL, NULL) group.
where       col1 is not null
        and col2 is not null

group by    least    (col1, col2)
           ,greatest (col1, col2)

having     count (*) >= 2
;
+-----------+--------------+-----+
| least_col | greatest_col | _c2 |
+-----------+--------------+-----+
| Robert    | Suzane       |   2 |
| Ricky     | Sam          |   2 |
+-----------+--------------+-----+