问题陈述
我的文件如下,模式为Event_Time,AD_id
file_20170102-May have records with event_time for 20170101,20170102,20170103
file_20170103-May have records with event_time for 20170102,20170103,20170104
此处事件时间是事件发生的时间,文件名的时间戳是收集事件的时间。因此文件名的时间戳和文件中的event_time不同步。
当我将这些数据写入hive时,我肯定需要根据event_time分区写入数据,因为用户对基于event_time的查询感兴趣。
所以我设想的输出目录结构如下:
/path/to/output/event_time=20170102/....parquet
/path/to/output/event_time=20170103/....parquet
但是我需要能够跟踪文件时间戳,因为有时文件被重新发布,我们想根据文件时间戳删除已处理的文件。
有没有办法可以写成这样: /path/to/output/event_time=20170101/20170102(file_timestamp)
请注意,上面的20170102(file_timestamp)是一个普通目录,而不是hive分区。
或者,我能否控制parquet文件的名称,这样当我想删除文件时,就很容易找出要删除的文件?
答案 0 :(得分:0)
/home/dmarkovitz/myfiles
myfile_1_20161204.csv
20161204,1
20161203,2
myfile_2_20161205.csv
20161203,3
20161204,4
20161205,5
20161203,6
myfile_3_20161205.csv
20161205,7
20161205,8
20161203,9
**Hive**
-- External staging table over the directory that holds the raw CSV files.
-- Each row is (Event_Time, AD_id), with Event_Time kept as a yyyyMMdd string.
CREATE EXTERNAL TABLE myfiles
(
    Event_Time  STRING,
    AD_id       INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'file:///home/dmarkovitz/myfiles'
;
-- Show every staged row together with the path of the file it was read from
-- (input__file__name is a Hive virtual column).
SELECT
    *,
    input__file__name
FROM myfiles
;
+------------+-------+-----------------------------------------------------+
| event_time | ad_id | input__file__name |
+------------+-------+-----------------------------------------------------+
| 20161204 | 1 | file:/home/dmarkovitz/myfiles/myfile_1_20161204.csv |
| 20161203 | 2 | file:/home/dmarkovitz/myfiles/myfile_1_20161204.csv |
| 20161205 | 7 | file:/home/dmarkovitz/myfiles/myfile_3_20161205.csv |
| 20161205 | 8 | file:/home/dmarkovitz/myfiles/myfile_3_20161205.csv |
| 20161203 | 9 | file:/home/dmarkovitz/myfiles/myfile_3_20161205.csv |
| 20161203 | 3 | file:/home/dmarkovitz/myfiles/myfile_2_20161205.csv |
| 20161204 | 4 | file:/home/dmarkovitz/myfiles/myfile_2_20161205.csv |
| 20161205 | 5 | file:/home/dmarkovitz/myfiles/myfile_2_20161205.csv |
| 20161203 | 6 | file:/home/dmarkovitz/myfiles/myfile_2_20161205.csv |
+------------+-------+-----------------------------------------------------+
-- Parquet target table. Both the file date and the event date are partition
-- columns, so each (file_dt, Event_Time) pair is stored in its own directory.
CREATE TABLE mytable
(
    AD_id INT
)
PARTITIONED BY (file_dt DATE, Event_Time DATE)
STORED AS PARQUET
;
-- Both partition columns are resolved dynamically, which requires nonstrict mode.
SET hive.exec.dynamic.partition.mode=nonstrict;

-- Load the staged rows into mytable. Dynamic partition values are taken
-- positionally from the last two select expressions: file_dt, then Event_Time.
-- file_dt is the 8-digit date that sits right before the extension of the
-- source file name (e.g. myfile_1_20161204.csv -> 2016-12-04). The pattern is
-- anchored at the end of the path, so a '_' or '.' in a parent directory
-- cannot shift which token is picked (as a positional split on the path would).
INSERT INTO TABLE mytable PARTITION (file_dt, Event_Time)
SELECT
    ad_id,
    from_unixtime(unix_timestamp(regexp_extract(input__file__name, '_([0-9]{8})[.][^./]*$', 1), 'yyyyMMdd'), 'yyyy-MM-dd') AS file_dt,
    from_unixtime(unix_timestamp(Event_Time, 'yyyyMMdd'), 'yyyy-MM-dd') AS event_time
FROM myfiles
;
-- List the (file_dt, event_time) partitions created by the dynamic insert.
SHOW PARTITIONS mytable;
+------------------------------------------+
| partition |
+------------------------------------------+
| file_dt=2016-12-04/event_time=2016-12-03 |
| file_dt=2016-12-04/event_time=2016-12-04 |
| file_dt=2016-12-05/event_time=2016-12-03 |
| file_dt=2016-12-05/event_time=2016-12-04 |
| file_dt=2016-12-05/event_time=2016-12-05 |
+------------------------------------------+
-- Show every loaded row with the path of the partition file that stores it.
SELECT
    *,
    input__file__name
FROM mytable
;
+-------+------------+------------+----------------------------------------------------------------------+
| ad_id | file_dt | event_time | input__file__name |
+-------+------------+------------+----------------------------------------------------------------------+
| 2 | 2016-12-04 | 2016-12-03 | file:/mydb/mytable/file_dt=2016-12-04/event_time=2016-12-03/000000_0 |
| 1 | 2016-12-04 | 2016-12-04 | file:/mydb/mytable/file_dt=2016-12-04/event_time=2016-12-04/000000_0 |
| 9 | 2016-12-05 | 2016-12-03 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-03/000000_0 |
| 3 | 2016-12-05 | 2016-12-03 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-03/000000_0 |
| 6 | 2016-12-05 | 2016-12-03 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-03/000000_0 |
| 4 | 2016-12-05 | 2016-12-04 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-04/000000_0 |
| 7 | 2016-12-05 | 2016-12-05 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-05/000000_0 |
| 8 | 2016-12-05 | 2016-12-05 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-05/000000_0 |
| 5 | 2016-12-05 | 2016-12-05 | file:/mydb/mytable/file_dt=2016-12-05/event_time=2016-12-05/000000_0 |
+-------+------------+------------+----------------------------------------------------------------------+
-- List the tables and partitions this query depends on.
-- The filter is on Event_Time only, the second-level partition column.
EXPLAIN DEPENDENCY
SELECT *
FROM mytable
WHERE Event_Time = DATE '2016-12-04'
;
{"input_tables":[{"tablename":"local_db@mytable","tabletype":"MANAGED_TABLE"}],"input_partitions":[{"partitionName":"local_db@mytable@file_dt=2016-12-04/event_time=2016-12-04"},{"partitionName":"local_db@mytable@file_dt=2016-12-05/event_time=2016-12-04"}]}
**bash**
tree mytable
mytable
├── file_dt=2016-12-04
│ ├── event_time=2016-12-03
│ │ └── 000000_0
│ └── event_time=2016-12-04
│ └── 000000_0
└── file_dt=2016-12-05
├── event_time=2016-12-03
│ └── 000000_0
├── event_time=2016-12-04
│ └── 000000_0
└── event_time=2016-12-05
└── 000000_0