HIVE:按最近的日期进行左连接(LEFT JOIN)

时间:2017-04-14 18:41:33

标签: date hadoop join hive hiveql

我正在尝试按某个键连接(join)HIVE中的2个表,并且在连接时需要匹配2个表中最接近的日期。例如,下面是2个输入表:

<----------TABLE A------------->            <------------TABLE B------------>
A_id    A_date      changed_col             B_id    B_date      B_value A_id
****    ******      ***********             ****    ******      ******* *****   
A01     2017-03-20      ABC                 B01     2017-04-02  200     A01
A01     2017-04-01      XYZ                 B01     2017-04-04  500     A01
A01     2017-04-05      LLL             

当我将表B与表A连接时,应该在表A中针对相同的键(A_id)查找不晚于B_date的最接近日期。以下是预期的输出表:

B_id    B_date          A_id        A_date      changed col   B_value
****    ******          ****        ******      ***********   *******
B01     2017-04-02      A01     2017-04-01      XYZ             200
B01     2017-04-04      A01     2017-04-01      XYZ             500

非常感谢任何帮助。谢谢!

4 个答案:

答案 0 :(得分:2)

-- For every row of B, attach the row of A with the same A_id whose A_date is
-- the latest one that is not after B_date. B rows that have no such A row are
-- kept (left join) with NULL A_date / changed_col.
select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A.A_date
       ,A.changed_col
       ,B.B_value

from                B

        left join  (select  B_id
                           ,B_date
                           ,A_id
                           ,A_date
                           ,changed_col

                    from   (select  B.B_id
                                   ,B.B_date
                                   ,B.A_id
                                   ,A.A_date
                                   ,A.changed_col

                                    -- B_id alone does not identify a B row (the sample data
                                    -- repeats B01), so candidates are ranked per
                                    -- (B_id, B_date, A_id); otherwise one winner would be
                                    -- shared by all rows with the same B_id and an earlier
                                    -- B row could receive an A_date later than its own B_date.
                                   ,row_number () over
                                    (
                                        partition by    B.B_id
                                                       ,B.B_date
                                                       ,B.A_id
                                        order by        A.A_date desc
                                    ) as rn
                            from            B
                                    join    A
                                    on      A.A_id = B.A_id
                            -- only A rows dated on or before the B row are candidates
                            where   A.A_date <= B.B_date
                            ) ranked

                    -- rn = 1 is the candidate with the latest A_date
                    where rn = 1
                    ) A

        on          A.B_id   = B.B_id
        and         A.B_date = B.B_date
        and         A.A_id   = B.A_id
+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+

答案 1 :(得分:1)

-- Stack the rows of A and B into a single timeline, carry the "largest" A row
-- seen so far forward within each A_id, and finally keep only the B rows.
with timeline as (
    -- B rows: they carry no changed_col of their own
    select
        B_id,
        B_date      as dt,
        B_value,
        A_id,
        'B'         as tab,
        null        as changed_col
    from B

    union all

    -- A rows: they carry no B_id / B_value of their own
    select
        null        as B_id,
        A_date      as dt,
        null        as B_value,
        A_id,
        'A'         as tab,
        changed_col
    from A
),

tagged as (
    select
        B_id,
        dt,
        B_value,
        A_id,
        tab,
        -- Running max over the A rows only (the case yields NULL for B rows).
        -- Ordering by dt, tab places an 'A' row before a 'B' row of the same
        -- date, so an A row dated exactly on B_date is inside the frame.
        -- NOTE(review): relies on max() comparing structs field by field,
        -- A_date first - confirm for the Hive version in use.
        max(
            case
                when tab = 'A'
                then named_struct ('A_date',dt,'changed_col',changed_col)
            end
        ) over (
            partition by A_id
            order by dt, tab
            rows between unbounded preceding and current row
        ) as A_data
    from timeline
)

select
    B_id,
    dt              as B_date,
    A_id,
    A_data.A_date   as A_date,
    A_data.changed_col,
    B_value
from tagged
where tab = 'B'
+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+            

答案 2 :(得分:1)

-- For every row of B, attach the A row (same A_id) with the latest A_date that
-- is not after B_date, using an aggregate instead of a window function.
-- B rows without any qualifying A row are kept with NULL A_date / changed_col.
select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A.A_data.A_date         as A_date
       ,A.A_data.changed_col    as changed_col
       ,B.B_value

from                B

        left join  (select      B.B_id
                               ,B.B_date
                               ,B.A_id
                                -- NOTE(review): relies on max() comparing structs field by
                                -- field, A_date first, so the latest A row wins - confirm
                                -- for the Hive version in use.
                               ,max (named_struct ('A_date',A.A_date,'changed_col',A.changed_col)) as A_data

                    from                B
                                join    A
                                on      A.A_id = B.A_id

                    -- only A rows dated on or before the B row are candidates
                    where       A.A_date <= B.B_date

                    -- B_id alone does not identify a B row (the sample data repeats
                    -- B01), so aggregate per (B_id, B_date, A_id); grouping by B_id only
                    -- would hand the same A row to every B row sharing that B_id.
                    group by    B.B_id
                               ,B.B_date
                               ,B.A_id
                    ) A

        on          A.B_id   = B.B_id
        and         A.B_date = B.B_date
        and         A.A_id   = B.A_id
+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+

答案 3 :(得分:-1)

另一种选择是:

-- Alternative without window functions or structs:
--   1. "nearest" finds, per B row, the latest A_date (same A_id) not after B_date;
--   2. that date is joined back to TABLE_A to fetch changed_col.
-- Left joins keep B rows that have no qualifying A row (A1 columns are NULL).
-- If TABLE_A holds several rows with the same (A_id, A_date), each of them is returned.
select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A1.A_id
       ,A1.A_date
       ,A1.changed_col
       ,B.B_value
from    TABLE_B as B
left join (
        select  B2.B_id
               ,B2.B_date
               ,B2.A_id
               ,max(A.A_date) as A_date
        from    TABLE_B as B2
        join    TABLE_A as A
          on    A.A_id = B2.A_id
        -- only A rows dated on or before the B row are candidates
        where   A.A_date <= B2.B_date
        group by B2.B_id
                ,B2.B_date
                ,B2.A_id
        ) as nearest
  on    nearest.B_id   = B.B_id
  and   nearest.B_date = B.B_date
  and   nearest.A_id   = B.A_id
left join TABLE_A as A1
  on    A1.A_id   = nearest.A_id
  and   A1.A_date = nearest.A_date