Data in the access_log file looks like this:
in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839
The values are space-separated, so I created the table schema like this:
CREATE TABLE IF NOT EXISTS access_log (
host STRING,
identity STRING,
apache_user STRING,
time STRING,
request STRING,
status STRING,
size STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s"
)
STORED AS TEXTFILE;
The table was created successfully and I loaded the data with the following command:
LOAD DATA LOCAL INPATH '${env:HOME}/work/pocs/3.weblogs/access_log.txt' OVERWRITE INTO TABLE access_log;
select time from access_log limit 2;
The time values I get back are:
[01/Aug/1995:00:00:01 -0400]
[01/Aug/1995:00:00:07 -0400]
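As a quick sanity check (a minimal sketch against the same access_log table, not part of the original post), the remaining columns can also be projected to confirm that each regex capture group lines up with the intended column:
-- project every parsed column for a couple of rows
select host, identity, apache_user, time, request, status, size from access_log limit 2;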
So now I want to partition the data by month, and I am trying to create a new table for the partitioned data at the hive prompt:
CREATE TABLE IF NOT EXISTS access_log_partition (
host STRING,
identity STRING,
apache_user STRING,
request STRING,
status STRING,
size STRING
)
PARTITIONED BY (time string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s"
)
STORED AS TEXTFILE;
After creating the new table (access_log_partition) for the partitioned data, I want to partition it by month, so I am first verifying the date format to partition on, using the command below:
select date_format(to_utc_timestamp(from_unixtime(unix_timestamp(time,'[dd/MMM/yyyy:HH:mm:ss z]')),'PST'),'MMM') as `mmm` from access_log limit 5;
The result is as follows:
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Time taken: 0.329 seconds, Fetched: 10 row(s)
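To see the raw time string next to the derived month label (a minimal sketch that simply reuses the same expression from above), the two can be projected side by side:
-- show the original timestamp and the month abbreviation it maps to
select time, date_format(to_utc_timestamp(from_unixtime(unix_timestamp(time,'[dd/MMM/yyyy:HH:mm:ss z]')),'PST'),'MMM') as `mmm` from access_log limit 5;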
Now I want to move the data from access_log into the partitioned access_log_partition table, so I tried the following command:
INSERT OVERWRITE TABLE access_log_partition partition(time) select host, identity, apache_user, date_format(to_utc_timestamp(from_unixtime(unix_timestamp(time,'[dd/MMM/yyyy:HH:mm:ss z]')),'PST'),'MMM') as `mmm`, request, status, size from access_log;
But I am getting the error below:
Diagnostic Messages for this Task:
Error: java.lang.RuntimeException: Error in configuring object
at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:109)
So please help me partition the data by month.
Answer 0 (score: 0)
create external table access_log
(
host string
,identity string
,apache_user string
,time string
,request string
,status string
,size string
)
row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
with serdeproperties ('input.regex'='([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)')
stored as textfile
;
select * from access_log
;
+-------------------+----------+-------------+------------------------------+----------------------------------------------------------------+--------+------+
| host | identity | apache_user | time | request | status | size |
+-------------------+----------+-------------+------------------------------+----------------------------------------------------------------+--------+------+
| in24.inetnebr.com | - | - | [01/Aug/1995:00:00:01 -0400] | "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" | 200 | 1839 |
+-------------------+----------+-------------+------------------------------+----------------------------------------------------------------+--------+------+
create table if not exists access_log_partition
(
host string
,identity string
,apache_user string
,request string
,status string
,size string
)
partitioned by (time string)
stored as textfile
;
set hive.exec.dynamic.partition.mode=nonstrict;
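-- Note: with a dynamic-partition INSERT, the partition column (time) must be the
-- last column in the SELECT list, matching the PARTITION (time) clause; that is
-- why mmm is projected last below.
-- Depending on the Hive version, dynamic partitioning may also need to be enabled
-- explicitly (an assumption about the setup, not part of the original answer):
set hive.exec.dynamic.partition=true;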
insert into access_log_partition partition (time)
select host,identity,apache_user,request,status,size
,date_format(to_utc_timestamp(from_unixtime(unix_timestamp(time,'[dd/MMM/yyyy:HH:mm:ss z]')),'PST'),'MMM') as `mmm`
from access_log
;
select * from access_log_partition
;
+-------------------+----------+-------------+----------------------------------------------------------------+--------+------+------+
| host | identity | apache_user | request | status | size | time |
+-------------------+----------+-------------+----------------------------------------------------------------+--------+------+------+
| in24.inetnebr.com | - | - | "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" | 200 | 1839 | Aug |
+-------------------+----------+-------------+----------------------------------------------------------------+--------+------+------+
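To confirm that the data actually landed in a per-month partition (a minimal sketch; for this sample data the only expected partition is time=Aug), the partition list can be inspected:
show partitions access_log_partition;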