我的文本文件中有如下格式的数据:
323.81.303.680 - - [25/Oct/2011:01:41:00 -0500] "GET /download/download6.zip HTTP/1.1" 200 0 "-" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.19) Gecko/2010031422 Firefox/3.0.19"
我使用 RegexSerDe 为上述数据创建了如下表结构:
-- Table over raw web-server access-log lines stored as plain text.
-- Each line is split into the nine STRING columns below by the capture
-- groups of "input.regex" (group N feeds the N-th column).
-- `time` keeps the bracketed log timestamp as text,
-- e.g. [25/Oct/2011:01:41:00 -0500].
-- `time` is backtick-quoted because TIME is a reserved keyword in newer
-- Hive releases; the column name itself is unchanged.
CREATE TABLE IF NOT EXISTS weblogs (
    host        STRING,
    identity    STRING,
    apache_user STRING,
    `time`      STRING,
    request     STRING,
    status      STRING,
    size        STRING,
    referer     STRING,
    agent       STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
    "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;
数据已正确加载到该表中。
但问题是,我需要按月对数据进行分区。
表中的时间字段形如 [25/Oct/2011:01:41:00 -0500],
类型为字符串。
由于该时间字段是字符串类型,我不知道如何据此分区;请问如何按年份和月份对所有数据进行分区?
答案 0 :(得分:0)
-- Step-by-step derivation of a year-month key from the weblogs `time` string.
-- Output columns, in order:
--   1. the raw string, e.g. [25/Oct/2011:01:41:00 -0500]
--   2. unix_timestamp of that string (the square brackets are part of the pattern)
--   3. from_unixtime of column 2
--   4. column 3 passed through to_utc_timestamp with zone 'PST'
--   5. column 4 formatted as year-month, e.g. 2011-10
-- The year pattern is lowercase 'yyyy' (calendar year). Uppercase 'YYYY' is the
-- week-based year and yields the wrong year for dates close to a year boundary.
-- NOTE(review): 'PST' is hard-coded; it presumably has to match the time zone
-- in which from_unixtime renders its result on your cluster - confirm.
-- `time` is backtick-quoted because TIME is a reserved keyword in newer Hive releases.
select `time`
      ,unix_timestamp(`time`,'[dd/MMM/yyyy:HH:mm:ss z]')
      ,from_unixtime(unix_timestamp(`time`,'[dd/MMM/yyyy:HH:mm:ss z]'))
      ,to_utc_timestamp(from_unixtime(unix_timestamp(`time`,'[dd/MMM/yyyy:HH:mm:ss z]')),'PST')
      ,date_format(to_utc_timestamp(from_unixtime(unix_timestamp(`time`,'[dd/MMM/yyyy:HH:mm:ss z]')),'PST'),'yyyy-MM')
from weblogs
;
+------------------------------+---------------+---------------------+----------------------------+---------+
| time | c1 | c2 | c3 | c4 |
+------------------------------+---------------+---------------------+----------------------------+---------+
| [25/Oct/2011:01:41:00 -0500] | 1,319,524,860 | 2011-10-24 23:41:00 | 2011-10-25 06:41:00.000000 | 2011-10 |
+------------------------------+---------------+---------------------+----------------------------+---------+