我创建了一个包含 struct 数组的表 temp:
create table temp (regionkey smallint, name string, comment string, nations array<struct<n_nationkey:smallint,n_name:string,n_comment:string>>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ',';
然后我将数据加载到表中:
LOAD DATA LOCAL INPATH '/Data Sets/region.csv' INTO TABLE temp;
执行查询
select * from temp;
时,期望的输出是:
4 EUROPE Low sale Business Region [{"n_nationkey":22,"n_name":"Ryan","n_comment":"Reference the site"}]
但实际输出是
4 EUROPE Low sale Business Region [{"n_nationkey":22,"n_name":null,"n_comment":null},{"n_nationkey":null,"n_name":null,"n_comment":null},{"n_nationkey":null,"n_name":null,"n_comment":null}]
数据文件
4|EUROPE|Low sale Business Region for Training4Exam.com|7,Bulgaria,Reference
4|EUROPE|Low sale Business Region for HadoopExam.com|19,Belgium,Reference site
4|EUROPE|Low sale Business Region for Training4Exam.com|22,Ryan,Reference site
这是我第一次使用数组(array)和结构体(struct),对此毫无头绪。非常感谢任何帮助。谢谢!
答案 0 :(得分:1)
<strong>map keys terminated by ','</strong>
-- External table whose last column is an array of structs.
-- No COLLECTION ITEMS delimiter is declared here, so the array-element separator
-- is left at the SerDe default and the fourth '|'-delimited field of each row is
-- read as a single array element. ',' is declared as the MAP KEYS delimiter,
-- which is the separator that splits that element into the struct's fields.
CREATE EXTERNAL TABLE temp (
    regionkey SMALLINT,
    name      STRING,
    comment   STRING,
    nations   ARRAY<STRUCT<n_nationkey:SMALLINT,n_name:STRING,n_comment:STRING>>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '|'
    MAP KEYS TERMINATED BY ',';

-- Inspect the parsed rows.
SELECT *
FROM temp;
+-----------+--------+------------------------------------------------+-----------------------------------------------------------------------+
| regionkey | name | comment | nations |
+-----------+--------+------------------------------------------------+-----------------------------------------------------------------------+
| 4 | EUROPE | Low sale Business Region for Training4Exam.com | [{"n_nationkey":7,"n_name":"Bulgaria","n_comment":"Reference "}] |
| 4 | EUROPE | Low sale Business Region for HadoopExam.com | [{"n_nationkey":19,"n_name":"Belgium","n_comment":"Reference site "}] |
| 4 | EUROPE | Low sale Business Region for Training4Exam.com | [{"n_nationkey":22,"n_name":"Ryan","n_comment":"Reference site"}] |
+-----------+--------+------------------------------------------------+-----------------------------------------------------------------------+
(以下似为 Hive 源码中关于分隔符的 Javadoc 注释:)
为了向后兼容,前 3 个分隔符被初始化为表属性中给定的值。默认的分隔符数量为 8;
如果仅设置了 hive.serialization.extend.nesting.levels,则分隔符的数量将扩展为 24;
如果设置了 hive.serialization.extend.additional.nesting.levels,则分隔符的数量将扩展到 154。
@param tableProperties 表属性,用于提取用户提供的分隔符
答案 1 :(得分:0)
David 的回答非常有效,我非常喜欢它,但无法理解为什么必须用映射键替换集合项(根据他所建议的描述,Hive 中似乎存在一个错误,我不是编码专家).
不过,这是长版
-- Long-form alternative: stage every raw line as one string, then build the
-- array<struct<...>> column explicitly with split() / named_struct() instead of
-- relying on the SerDe delimiters to parse the nested value.

-- Staging table: a single column holding the whole unparsed line.
create table regiontemp (str string);

load data inpath '/user/cloudera/MohsenFiles/first_first.csv' into table regiontemp;

-- Target table with the nested column.
create external table region (
    r_regionkey smallint,
    r_name      string,
    r_comment   string,
    r_nations   array<struct<n_nationkey:smallint,n_name:string,n_comment:string>>
)
row format delimited
fields terminated by '|'
collection items terminated by ',';  -- fix: this terminator was missing, so CREATE ran into the INSERT below

-- Split each staged line once in the subquery:
--   parts  = the '|'-separated top-level fields
--   nation = the ','-separated pieces of the fourth field
-- and wrap the three nation pieces in a one-element array of struct.
insert overwrite table region
select
    cast(s.parts[0] as smallint) as r_regionkey,
    s.parts[1]                   as r_name,
    s.parts[2]                   as r_comment,
    array(
        named_struct(
            "n_nationkey", cast(s.nation[0] as smallint),
            "n_name",      s.nation[1],
            "n_comment",   s.nation[2]
        )
    )                            as r_nations
from (
    select
        split(str, '\\|')                as parts,
        split(split(str, '\\|')[3], ",") as nation
    from regiontemp
) s;
现在在 Impala 中执行 invalidate metadata;
或在 Hive 中(Aggregation On Struct columns Hive 再次基于 David 对另一个 Q 的回答)