我正在尝试从 Hive 外部表中查询多个元素。我的 XML 如下:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Sample input: one <units version="1"> element holding two <unit> records,
     each with an id, a name and a single nested details/detail/subid. -->
<data>
<units version="1">
<unit>
<id>1</id>
<name>ABC</name>
<details>
<detail>
<subid>001</subid>
</detail>
</details>
</unit>
<unit>
<id>2</id>
<name>DEF</name>
<details>
<detail>
<subid>002</subid>
</detail>
</details>
</unit>
</units>
</data>
我的表脚本是这样的:
-- External Hive table over the XML sample, read through the IBM SPSS XML SerDe.
-- xmlinput.start / xmlinput.end bracket the whole <units ...> element, so one
-- record spans every <unit> inside it. Each XPath below can therefore match
-- several nodes, which is why the columns are declared as array<string>.
CREATE EXTERNAL TABLE testxml (
    id    ARRAY<STRING>,
    name  ARRAY<STRING>,
    subid ARRAY<STRING>
)
ROW FORMAT SERDE "com.ibm.spss.hive.serde2.xml.XmlSerDe"
WITH SERDEPROPERTIES (
    "column.xpath.id"    = "/units/unit/id/text()",
    "column.xpath.name"  = "/units/unit/name/text()",
    "column.xpath.subid" = "/units/unit/details/detail/subid/text()"
)
STORED AS
    INPUTFORMAT  "com.ibm.spss.hive.serde2.xml.XmlInputFormat"
    OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
LOCATION "somepath/test"
TBLPROPERTIES (
    "xmlinput.start" = "<units version",
    "xmlinput.end"   = "</units>"
);
我的select语句返回以下值:
hive> select * from testxml;
OK
["1","2"] ["ABC","DEF"] ["001","002"]
但我想要的输出应该是这样的:
+--+------+------+
|id| name| subid|
+--+------+------+
| 1| ABC| 001|
| 2| DEF| 002|
+--+------+------+
如有任何建议,我将不胜感激。
答案 0 :(得分:1)
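试试这个:把 xmlinput.start / xmlinput.end 设为单个 <unit> 元素,使每个 <unit> 成为一条记录(一行);相应地,XPath 改为以 /unit 开头,列类型由 array<string> 改为 string。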
-- External Hive table that yields one row per <unit> element.
-- xmlinput.start / xmlinput.end delimit a single <unit>...</unit> fragment, so
-- every XPath is rooted at /unit and each column is a plain string.
CREATE EXTERNAL TABLE testxml (
    id    STRING,
    name  STRING,
    subid STRING
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    "column.xpath.id"    = "/unit/id/text()",
    "column.xpath.name"  = "/unit/name/text()",
    "column.xpath.subid" = "/unit/details/detail/subid/text()"
)
STORED AS
    INPUTFORMAT  "com.ibm.spss.hive.serde2.xml.XmlInputFormat"
    OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
LOCATION "file:///home/cloudera/xmlfiles2"
TBLPROPERTIES (
    "xmlinput.start" = "<unit>",
    "xmlinput.end"   = "</unit>"
);
INFO : OK
+-------------+---------------+----------------+--+
| testxml.id | testxml.name | testxml.subid |
+-------------+---------------+----------------+--+
| 1 | ABC | 001 |
| 2 | DEF | 002 |
+-------------+---------------+----------------+--+
2 rows selected (0.154 seconds)