-- Per-user purchase list, read from delimited text files under /user/rj/output2.
-- Each input line has the shape  user_id-product_id:timestamps,product_id:timestamps
-- A dash separates the two columns, a comma separates the array elements, and
-- a colon separates the two fields inside each struct.
CREATE TABLE IF NOT EXISTS Table2 (
    USER_ID        BIGINT,
    PURCHASED_ITEM ARRAY<STRUCT<PRODUCT_ID: BIGINT, TIMESTAMPS: STRING>>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '-'
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/rj/output2';
以下是表2中的数据
1345653-110909316904:1341894546,221065796761:1341887508
我可以使用下面的查询来展开(explode)上述数据,对于上面这种格式的数据,它可以正常工作:
-- Flatten purchased_item so that every (product_id, timestamps) struct of a
-- user becomes its own output row.
-- The outer SELECT * simply passes through the three columns projected by the
-- inner query: user_id, product_id, timestamps.
SELECT *
FROM (
    SELECT
        user_id,
        item.product_id AS product_id,
        item.timestamps AS timestamps
    FROM table2
    LATERAL VIEW explode(purchased_item) items AS item
) prod_and_ts;
我会得到这样的输出,这很好 -
1345653 110909316904 1341894546
1345653 221065796761 1341887508
但在某些情况下,表中的数据是下面这样的:同一个 product_id 后面附加了多个时间戳,时间戳之间用井号(#)分隔 -
1345653-110909316904:1341894546#1341885695,221065796761:1341887508#1341885453
我需要使用HiveQL查询输出这样的输出 -
1345653 110909316904 1341894546
1345653 110909316904 1341885695
1345653 221065796761 1341887508
1345653 221065796761 1341885453
有没有办法做到这一点?
任何建议都将不胜感激。
P.S. 我几天前问过类似的问题,但那时的数据不同;现在的数据完全不同,而我需要类似的输出。

答案 0(得分:4)
您可以使用函数 regexp_replace 或 regexp_extract 去掉 # 及其后面的内容,只保留每个产品的第一个时间戳。试试这个:
-- Flatten purchased_item into one row per product and strip every "#<digits>"
-- suffix from the timestamps string, so only the first timestamp is kept.
-- The Hive built-in is named regexp_replace. The original call to regex_replace
-- does not resolve to any built-in function and fails at compile time.
-- NOTE(review): this returns one row per product, not one row per timestamp.
-- To emit a row for each timestamp, add a second LATERAL VIEW over
-- explode(split(prod_and_ts.timestamps, '#')) instead of removing the suffixes.
SELECT *
FROM (
    SELECT
        user_id,
        prod_and_ts.product_id AS product_id,
        regexp_replace(prod_and_ts.timestamps, "#\\d*", "") AS timestamps
    FROM table2
    LATERAL VIEW explode(purchased_item) exploded_table AS prod_and_ts
) prod_and_ts;