我正在尝试使用密钥加入HIVE中的2个表,并在加入时使用2个表中最近的日期。例如:下面是2个输入表
<----------TABLE A-------------> <------------TABLE B------------>
A_id A_date changed_col B_id B_date B_value A_id
**** ****** *********** **** ****** ******* *****
A01 2017-03-20 ABC B01 2017-04-02 200 A01
A01 2017-04-01 XYZ B01 2017-04-04 500 A01
A01 2017-04-05 LLL
但是,当我将表B与表A联系起来时,它应该在表A中查找相同键(A_id)中最接近的最低日期。以下是预期的输出表:
B_id B_date A_id A_date changed col B_value
**** ****** **** ****** *********** *******
B01 2017-02-04 A01 2017-01-04 XYZ 200
B01 2017-04-04 A01 2017-01-04 XYZ 500
非常感谢任何帮助。感谢
答案 0 :(得分:2)
select B.B_id
,B.B_date
,B.A_id
,A.A_date
,A.changed_col
,B_value
from B
left join (select *
from (select B.B_id
,A.A_date
,A.changed_col
,row_number () over
(
partition by B.B_id
order by A.A_date desc
) as rn
from B
join A
on A.A_id = B.A_id
where A.A_date <= B.B_date
) A
where rn = 1
) A
on A.B_id =
B.B_id
+------+------------+------+------------+-------------+---------+
| b_id | b_date | a_id | a_date | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01 | 2017-04-02 | A01 | 2017-04-01 | XYZ | 200 |
| B01 | 2017-04-04 | A01 | 2017-04-01 | XYZ | 500 |
+------+------------+------+------------+-------------+---------+
答案 1 :(得分:1)
select B_id
,dt as B_date
,A_id
,A_data.A_date as A_date
,A_data.changed_col
,B_value
from (select B_id,dt,B_value,A_id,tab
,max
( case
when tab = 'A'
then named_struct ('A_date',dt,'changed_col',changed_col)
end
) over
(
partition by A_id
order by dt,tab
rows between unbounded preceding
and current row
) as A_data
from (select B_id,B_date as dt,B_value,A_id
,'B' as tab,null as changed_col
from B
union all
select null as B_id,A_date as dt,null as B_value,A_id
,'A' as tab,changed_col
from A
) t
) t
where tab = 'B'
+------+------------+------+------------+-------------+---------+
| b_id | b_date | a_id | a_date | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01 | 2017-04-02 | A01 | 2017-04-01 | XYZ | 200 |
| B01 | 2017-04-04 | A01 | 2017-04-01 | XYZ | 500 |
+------+------------+------+------------+-------------+---------+
答案 2 :(得分:1)
select B.B_id
,B.B_date
,B.A_id
,A.A_data.A_date as A_date
,A.A_data.changed_col as changed_col
,B_value
from B
left join (select B.B_id
,max (named_struct ('A_date',A_date,'changed_col',changed_col)) as A_data
from B
join A
on A.A_id = B.A_id
where A.A_date <= B.B_date
group by B.B_id
) A
on A.B_id =
B.B_id
+------+------------+------+------------+-------------+---------+
| b_id | b_date | a_id | a_date | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01 | 2017-04-02 | A01 | 2017-04-01 | XYZ | 200 |
| B01 | 2017-04-04 | A01 | 2017-04-01 | XYZ | 500 |
+------+------------+------+------------+-------------+---------+
答案 3 :(得分:-1)
另一种选择是:
select B.B_id, B.B_date, B.A_id, A1.A_id, A1.A_date, A1.changed_col, B.B_value
from
TABLE_B as B,
(
select A_id, A_date, chaged_col
from TABLE_A as A
where A_date = (select min(A_date) from TABLE_A where A_id = A.A_id) ) as A1
where B.A_id = A.A_id