我正在尝试使用hivexmlserde-1.0.5.3.jar在一个复杂的XML文件之上创建Hive表。对于网上示例中的XML,我可以让它正常工作,但对我自己的XML文件却不行,我的文件似乎比我在网上找到的示例都要复杂。
这是我的XML文件:
<?xml version="1.0"?>
<SSNExportDocument xmlns="urn:com:ssn:schema:export:SSNExportFormat.xsd" Version="0.1" DocumentID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e-2" ExportID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e" JobID="164771" RunID="3456662" CreationTime="2019-07-29T13:15:09.584-05:00" StartTime="2019-07-29T09:15:00.000-05:00" EndTime="2019-07-29T13:15:00.000-05:00">
<MeterData MeterName="50000010" UtilDeviceID="50000010" MacID="a0:06:5f:00:00:00:00:0a">
<IntervalReadData IntervalLength="15" StartTime="2019-07-29T08:00:00.000-05:00" EndTime="2019-07-29T12:00:00.000-05:00" NumberIntervals="16">
<Interval EndTime="2019-07-29T08:15:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:46.302-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="29">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
<Interval EndTime="2019-07-29T08:30:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:46.302-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="30">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
</IntervalReadData>
</MeterData>
<MeterData MeterName="50000022" UtilDeviceID="50000022" MacID="a0:06:5f:00:00:00:00:16">
<IntervalReadData IntervalLength="15" StartTime="2019-07-29T08:00:00.000-05:00" EndTime="2019-07-29T12:00:00.000-05:00" NumberIntervals="16">
<Interval EndTime="2019-07-29T08:15:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:49.324-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="29">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
<Interval EndTime="2019-07-29T08:30:00.000-05:00" GatewayCollectedTime="2019-07-29T12:06:49.324-05:00" BlockSequenceNumber="181" IntervalSequenceNumber="30">
<Reading Channel="20" RawValue="5625.0" Value="3.3750" UOM="kWh(del)" BlockEndValue="0"/>
<Reading Channel="30" RawValue="5625.0" Value="3375.0000" UOM="kWh(rec)" BlockEndValue="0"/>
<Reading Channel="101" RawValue="5625.0" Value="399.9999" UOM="V Ph(A-N)" BlockEndValue="0"/>
<Reading Channel="102" RawValue="5625.0" Value="399.9999" UOM="V Ph(B-N)" BlockEndValue="0"/>
<Reading Channel="103" RawValue="5625.0" Value="399.9999" UOM="I A max" BlockEndValue="0"/>
<Reading Channel="104" RawValue="5625.0" Value="399.9999" UOM="I B max" BlockEndValue="0"/>
<Reading Channel="50" RawValue="5625.0" Value="3.3750" UOM="kVARh(del)" BlockEndValue="0"/>
</Interval>
</IntervalReadData>
</MeterData>
</SSNExportDocument>
我首先试用了http://dennysjymbo.blogspot.com/2018/05/using-xml-serde-in-hive-for-exploding.html上的示例代码,它可以正常工作,所以我知道SerDe已经在类路径中被找到了。
我尝试了无数种方法来定义表。
这是我最近的尝试:
-- Start from a clean slate: remove any earlier definition of the table.
DROP TABLE IF EXISTS default.xmltest;

-- External table over the XML export files stored under /user/cyelve1/xmltest.
-- The single MeterData column is filled from the XPath
-- /SSNExportDocument/MeterData; its nested type lists the attribute names of
-- the MeterData, IntervalReadData, Interval and Reading elements as string
-- fields.
-- NOTE(review): xmlinput.start is the bare tag "<SSNExportDocument>", while the
-- sample document's root tag carries attributes; presumably input files must
-- use the bare tag for a record to be delimited -- confirm against
-- XmlInputFormat.
CREATE EXTERNAL TABLE default.xmltest (
    MeterData ARRAY<
        STRUCT<
            MeterData:ARRAY<
                STRUCT<
                    MeterName:STRING,
                    UtilDeviceID:STRING,
                    MacID:STRING,
                    IntervalReadData:STRUCT<
                        IntervalLength:STRING,
                        StartTime:STRING,
                        EndTime:STRING,
                        NumberIntervals:STRING,
                        Interval:ARRAY<
                            STRUCT<
                                EndTime:STRING,
                                GatewayCollectedTime:STRING,
                                BlockSequenceNumber:STRING,
                                IntervalSequenceNumber:STRING,
                                Reading:ARRAY<
                                    STRUCT<
                                        Channel:STRING,
                                        RawValue:STRING,
                                        Value:STRING,
                                        UOM:STRING,
                                        BlockEndValue:STRING
                                    >
                                >
                            >
                        >
                    >
                >
            >
        >
    >
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
    "column.xpath.MeterData" = "/SSNExportDocument/MeterData"
)
STORED AS
    INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
LOCATION "/user/cyelve1/xmltest"
TBLPROPERTIES (
    "xmlinput.start" = "<SSNExportDocument>",
    "xmlinput.end" = "</SSNExportDocument>"
);
注意:如果我按照其他示例中的做法,让xmlinput.start匹配带有属性的开始标签:
tblproperties ( "xmlinput.start" = "<SSNExportDocument " ,"xmlinput.end" = "</SSNExportDocument>" )
查询返回的结果是NULL。因此,为了测试,我把XML文件中的开始标签做了如下修改:
FROM
<SSNExportDocument xmlns="urn:com:ssn:schema:export:SSNExportFormat.xsd" Version="0.1" DocumentID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e-2" ExportID="eef9c8c5-0fc5-485b-bf05-7324917a7f5e" JobID="164771" RunID="3456662" CreationTime="2019-07-29T13:15:09.584-05:00" StartTime="2019-07-29T09:15:00.000-05:00" EndTime="2019-07-29T13:15:00.000-05:00">
TO
<SSNExportDocument>
我期望得到的结果在结构上与http://dennysjymbo.blogspot.com/2018/05/using-xml-serde-in-hive-for-exploding.html网站所用示例的结果相同:
[{"customerleveldata":{"survey_id":144434840,"client_id":6780,"service":"HH","recdate":"2018-04-02","disdate":"2018-01-01","analysis":[{"response":{"varname":"B2PR","value":"5"}},{"response":{"varname":"PI2PR","value":"5"}}],"demographics":[{"response":{"varname":"AGE","value":"90"}},{"response":{"varname":"CMSH_1","value":"Yes"}}],"hcahps":[{"response":{"varname":"CMSH_10","value":"Yes"}},{"response":{"varname":"CMSH_12","value":"Yes"}}]}}]
但这是我的结果:
[{"meterdata":[{"metername":null,"utildeviceid":null,"macid":null,"intervalreaddata":{"intervallength":null,"starttime":null,"endtime":"<string>2019-07-29T08:15:00.000-05:002019-07-29T08:30:00.000-05:00</string>","numberintervals":null,"interval":[{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]}]}}]},{"meterdata":[{"metername":null,"utildeviceid":null,"macid":null,"intervalreaddata":{"intervallength":null,"starttime":null,"endtime":"<s
tring>2019-07-29T08:15:00.000-05:002019-07-29T08:30:00.000-05:00</string>","numberintervals":null,"interval":[{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]},{"endtime":null,"gatewaycollectedtime":null,"blocksequencenumber":null,"intervalsequencenumber":null,"reading":[]}]}}]}]