使用SerDe将嵌套的XML数据加载到Hive中

时间:2017-05-25 12:47:26

标签: xml hive nested hive-serde

我正在尝试将嵌套的XML数据加载到Hive中。样本数据如下......

<CustomerOrders>
  <Customers>
    <CustID>ALFKI</CustID>
    <Orders>
      <OrderID>10643</OrderID>
      <CustomerID>ALFKI</CustomerID>
      <OrderDate>1997-08-25</OrderDate>
    </Orders>
    <Orders>
      <OrderID>10692</OrderID>
      <CustomerID>ALFKI</CustomerID>
      <OrderDate>1997-10-03</OrderDate>
    </Orders>
    <CompanyName>Alfreds Futterkiste</CompanyName>
  </Customers>
  <Customers>
    <CustID>ANATR</CustID>
    <Orders>
      <OrderID>10308</OrderID>
      <CustomerID>ANATR</CustomerID>
      <OrderDate>1996-09-18</OrderDate>
    </Orders>
    <CompanyName>Ana Trujillo Emparedados y helados</CompanyName>
  </Customers>
</CustomerOrders>

以下是我正在使用的命令:

CREATE TABLE CUSTOMERORDERS(
          CustID STRING,
          Orders ARRAY<STRUCT<OrderID:STRING,CustomerID:STRING,OrderDate:STRING>>,
          CompanyName STRING)
          ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
          WITH SERDEPROPERTIES (
          "column.xpath.CustID"="/Customers/CustID/text()",
          "column.xpath.Orders"="/Customers/Orders",
          "column.xpath.OrderID"="/Customers/Orders/OrderID",
          "column.xpath.CustomerID"="/Customers/Orders/CustomerID",
          "column.xpath.OrderDate"="/Customers/Orders/OrderDate",
          "column.xpath.CompanyName"="/Customers/CompanyName/text()")
          STORED AS INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
          OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
          TBLPROPERTIES ("xmlinput.start"="<Customers>","xmlinput.end"= "</Customers>");

我得到的输出是:

    hive> select * from customerorders;
OK
ALFKI   [{"orderid":null,"customerid":null,"orderdate":null},{"orderid":null,"customerid":null,"orderdate":null}]       Alfreds Futterkiste
ANATR   [{"orderid":null,"customerid":null,"orderdate":null}]   Ana Trujillo Emparedados y helados
Time taken: 0.039 seconds, Fetched: 2 row(s)

我为nullOrderIDCustomerID获得OrderDate个值。任何人都可以帮我解决这个问题吗?

由于

1 个答案:

答案 0 :(得分:1)

create external table customerorders
(
    custid      string
   ,orders      array<struct<Orders:struct<OrderID:string,CustomerID:string,OrderDate:string>>>
   ,companyname string
)
row format serde 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
with serdeproperties 
(
    "column.xpath.CustID"       = "/Customers/CustID/text()"
   ,"column.xpath.Orders"       = "/Customers/Orders"
   ,"column.xpath.CompanyName"  = "/Customers/CompanyName/text()"
)

stored as 
inputformat     'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
outputformat    'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'
tblproperties 
(
    "xmlinput.start"    = "<Customers>"
   ,"xmlinput.end"      = "</Customers>"
);

-

select * from customerorders
;

-

+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------+
| custid |                                                                           orders                                                                            |            companyname             |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------+
| ALFKI  | [{"orders":{"orderid":"10643","customerid":"ALFKI","orderdate":"1997-08-25"}},{"orders":{"orderid":"10692","customerid":"ALFKI","orderdate":"1997-10-03"}}] | Alfreds Futterkiste                |
| ANATR  | [{"orders":{"orderid":"10308","customerid":"ANATR","orderdate":"1996-09-18"}}]                                                                              | Ana Trujillo Emparedados y helados |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------+