我正在创建一个名为 customer 的外部表,其中包含客户 ID、姓名和配偶姓名。
-- Customer table whose name and spouse_name columns are structs of
-- (fname, lname). Fields are separated by ',' and struct members by '$';
-- the single COLLECTION ITEMS delimiter applies to both struct columns.
-- The first column is declared as cust_id INT (the original "cust id"
-- had no underscore and no type, so the statement did not parse).
CREATE TABLE customer (
    cust_id     INT,
    name        STRUCT<fname:STRING, lname:STRING>,
    spouse_name STRUCT<fname:STRING, lname:STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '$';
我想知道,如果传入的数据是下面这样,应该如何处理:
1,FNAME1$LNAME1,SPOUSE_FNAME1#SPOUSE_LNAME1
2,FNAME2$LNAME2,SPOUSE_FNAME2#SPOUSE_LNAME2
我无法在 'collection items' 子句中指定两个分隔符。'$' 分隔符只会把 FNAME* 和 LNAME* 分开,对 SPOUSE_FNAME* 和 SPOUSE_LNAME* 不起任何作用。我们需要为此编写一个自定义 SerDe 吗?我不确定现实中的数据会是什么样子,但很可能在某个时候会收到这样的数据。
答案 0 :(得分:0)
一种可能的方法是将结构体加载为简单字符串并在视图中进行数据操作。
-- Staging definition: every column is a plain scalar, so the '$' and '#'
-- separators remain inside the raw name strings.
CREATE EXTERNAL TABLE customer (
    cust_id     INT,
    name        STRING,
    spouse_name STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
;
-- Inspect the raw rows; columns are listed explicitly instead of using *
-- so the output does not change silently if the table definition does.
SELECT
    cust_id,
    name,
    spouse_name
FROM customer
;
+---------+---------------+-----------------------------+
| cust_id | name | spouse_name |
+---------+---------------+-----------------------------+
| 1 | FNAME1$LNAME1 | SPOUSE_FNAME1#SPOUSE_LNAME1 |
| 2 | FNAME2$LNAME2 | SPOUSE_FNAME2#SPOUSE_LNAME2 |
+---------+---------------+-----------------------------+
-- View that rebuilds the two name structs from the raw delimited strings.
CREATE VIEW customer_v AS
SELECT
    parts.cust_id,
    named_struct('fname', parts.name_arr[0],   'lname', parts.name_arr[1])   AS name,
    named_struct('fname', parts.spouse_arr[0], 'lname', parts.spouse_arr[1]) AS spouse_name
FROM (
    -- split() takes a regular expression, hence the escaped '$'.
    SELECT
        cust_id,
        split(name, '\\$')      AS name_arr,
        split(spouse_name, '#') AS spouse_arr
    FROM customer
) parts
;
-- Read the reconstructed structs back from the view; columns are listed
-- explicitly instead of using *.
SELECT
    cust_id,
    name,
    spouse_name
FROM customer_v
;
+---------+-------------------------------------+---------------------------------------------------+
| cust_id | name | spouse_name |
+---------+-------------------------------------+---------------------------------------------------+
| 1 | {"fname":"FNAME1","lname":"LNAME1"} | {"fname":"SPOUSE_FNAME1","lname":"SPOUSE_LNAME1"} |
| 2 | {"fname":"FNAME2","lname":"LNAME2"} | {"fname":"SPOUSE_FNAME2","lname":"SPOUSE_LNAME2"} |
+---------+-------------------------------------+---------------------------------------------------+
答案 1 :(得分:0)
试试这个
-- Raw landing table: both name columns are kept as undivided strings.
CREATE TABLE customer (
    cust_id     INT,
    name        STRING,
    spouse_name STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Replace the table contents with the input file (placeholder path).
LOAD DATA INPATH '<hdfs path of input file>'
OVERWRITE INTO TABLE customer;
-- Intermediate external table with the same scalar columns, stored as
-- comma-delimited text at an explicit HDFS location.
CREATE EXTERNAL TABLE customer_tmp (
    cust_id     INT,
    name        STRING,
    spouse_name STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/hdfs_location_of_customer_tmp';
-- Normalize the struct separators to ':' while copying into customer_tmp.
-- Only the known separators '$' and '#' are rewritten. The previous pattern
-- '\\W\\b' also replaced spaces, apostrophes and hyphens that sit in front
-- of a letter, which would break up names such as "Mary Ann" or "O'Brien".
INSERT OVERWRITE TABLE customer_tmp
SELECT
    cust_id,
    regexp_replace(name, '[$#]', ':')        AS name,
    regexp_replace(spouse_name, '[$#]', ':') AS spouse_name
FROM customer;
-- Final table: name and spouse_name are (fname, lname) structs whose
-- members are separated by ':' inside each comma-delimited field.
CREATE TABLE customer_final (
    cust_id     INT,
    name        STRUCT<fname:STRING, lname:STRING>,
    spouse_name STRUCT<fname:STRING, lname:STRING>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':'
STORED AS TEXTFILE;

-- Load the files found under the customer_tmp location, replacing any
-- existing contents of customer_final.
LOAD DATA INPATH '/hdfs_location_of_customer_tmp/*'
OVERWRITE INTO TABLE customer_final;
请不要忘记告诉我们是否有效:)