将多行JSON数据加载到HIVE表

时间:2017-02-13 06:56:36

标签: json hadoop hive

我有一个JSON数据,它是一个多行JSON。我创建了一个hive表来将数据加载到其中。我有另一个JSON,它是一个单行JSON记录。当我将单行JSON记录加载到其hive表并尝试查询时,它工作正常。但是当我将多行JSON加载到其hive表中时,它给出了以下异常:

Failed with exception java.io.IOException:org.apache.hadoop.hive.serde2.SerDeException: org.codehaus.jackson.JsonParseException: Unexpected end-of-input: expected close marker for OBJECT (from [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 0]) at [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 3]

以下是我的JSON数据:

{
  "uploadTimeStamp" : "1486631318873",
  "PDID" : "123",
  "data" : [ {
    "Data" : {
      "unit" : "rpm",
      "value" : "0"
    },
    "EventID" : "E1",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.0",
    "pii" : { }
  }, {
    "Data" : {
      "heading" : "N",
      "loc3" : "false",
      "loc" : "14.022425",
      "loc1" : "78.760587",
      "loc4" : "false",
      "speed" : "10"
    },
    "EventID" : "E2",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.1",
    "pii" : { }
  }, {
    "Data" : {
      "x" : "1.1",
      "y" : "1.2",
      "z" : "2.2"
    },
    "EventID" : "E3",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.0",
    "pii" : { }
  }, {
    "EventID" : "E4",
    "Data" : {
      "value" : "50",
      "unit" : "percentage"
    },
    "Version" : "1.0",
    "Timestamp" : 1486631318873,
    "PDID" : "123",
    "Timezone" : 330
  }, {
    "Data" : {
      "unit" : "kmph",
      "value" : "70"
    },
    "EventID" : "E5",
    "PDID" : "123",
    "Timestamp" : 1486631318873,
    "Timezone" : 330,
    "Version" : "1.0",
    "pii" : { }
  } ]
}

我正在使用/hive/lib/hive-hcatalog-core-0.13.0.jar

以下是我的create table语句:

-- Table test7 as posted in the question: JSON files under /xyz/abc/ stored as
-- text and decoded with the HCatalog JsonSerDe.
--
-- NOTE(review): the data column lists five comma-separated struct types inside
-- one array type. The table definition in the first answer further down uses a
-- single struct whose Data member carries the fields of every event kind.
-- Confirm against the Hive complex-type syntax that an array takes exactly one
-- element type.
-- NOTE(review): several names differ from the keys of the sample document
-- above: uploadtime vs uploadTimeStamp, time vs Timestamp, and
-- Location / latitude / longitude / Location2 vs loc3 / loc / loc1 / loc4.
-- Presumably the SerDe matches fields by name, so these would read back as
-- NULL. TODO confirm.
-- NOTE(review): the sample carries Version and the x / y / z readings as
-- quoted decimals (1.0, 1.1, 2.2) while they are declared int here. Verify
-- the intended types.
-- NOTE(review): the third struct repeats the first one field for field.
create table test7(
uploadtime bigint,
pdid string,
data array<
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
Location:string,
latitude:bigint,
longitude:bigint,
Location2:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
x:int,
y:int,
z:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
loc3:string,
latitude:bigint,
longitude:bigint,
loc4:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>
>
)
ROW FORMAT SERDE 
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/xyz/abc/';

编辑:

补充:下面是单行JSON、新的建表语句,以及执行该语句时的报错:

{"uploadTimeStamp":"1487183800905","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"event1","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc1":"false","latitude":"16.032425","longitude":"80.770587","loc2":"false","speed":"10"},"EventID":"event2","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"event3":"AccelerometerInfo","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"event4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1487183800905,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"event5","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}}]}
-- Second attempt from the question: table test1 for the single-line sample
-- record, decoded with the HCatalog JsonSerDe.
--
-- The statement is kept as posted because the parser error quoted right after
-- it refers to this exact text. Only the LOCATION literal was repaired: it had
-- been HTML-escaped and padded with spaces.
--
-- NOTE: the quoted error (line 11, column 10, counted from the create table
-- line) points at the first pii:struct<> entry, an empty struct type. The
-- first answer further down declares a one-field placeholder struct instead.
-- NOTE(review): like test7, this lists several struct types inside one array
-- type. Confirm that an array takes exactly one element type.
-- NOTE(review): unit:percentage in the fourth struct puts the word percentage
-- where a type name is expected, and the closing parenthesis of the column
-- list appears to be missing before ROW FORMAT. Both would presumably be
-- reported once the empty struct is fixed. TODO confirm.
create table test1(
uploadTimeStamp string,
PDID string,
data array<struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<heading:string,loc1:string,latitude:double,longitude:double,loc2:string,speed:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<x:float,y:float,z:float>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
EventID:string,
Data:struct<value:int,unit:percentage>,
Version:float,
TimeS:bigint,
PDID:string,
Timezone:int>,
struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>
>
ROW FORMAT SERDE 
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/ABC/XYZ/';

MismatchedTokenException(320!=313)
...
...
...
FAILED: ParseException line 11:10 mismatched input '<>' expecting < near 'struct' in struct type

2 个答案:

答案 0 :(得分:1)

示例数据

{"uploadTimeStamp":"1486631318873","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"E1","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10"},"EventID":"E2","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"EventID":"E3","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"E4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1486631318873,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"E5","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}}]}
-- Register the HCatalog core jar with the session. Presumably this is the jar
-- that ships the org.apache.hive.hcatalog.data.JsonSerDe class named in the
-- table definition that follows.
add jar /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;
-- External table over the JSON documents stored under /tmp/myjson, decoded
-- with the HCatalog JsonSerDe.
-- All events of a document are kept in the data array. Its single element
-- type has a Data member that lists the fields of every event kind, so fields
-- an event does not carry read back as NULL (see the query output below).
CREATE EXTERNAL TABLE myjson (
    uploadTimeStamp STRING,
    PDID            STRING,
    data            ARRAY<STRUCT<
        Data: STRUCT<
            unit:    STRING,
            value:   STRING,
            heading: STRING,
            loc3:    STRING,
            loc:     STRING,
            loc1:    STRING,
            loc4:    STRING,
            speed:   STRING,
            x:       STRING,
            y:       STRING,
            z:       STRING
        >,
        EventID:     STRING,
        PDID:        STRING,
        `Timestamp`: BIGINT,
        Timezone:    SMALLINT,
        Version:     STRING,
        -- one placeholder field, the sample documents carry an empty pii object
        pii:         STRUCT<dummy: STRING>
    >>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION '/tmp/myjson';
-- Show the table as loaded: the sample document comes back as one row with
-- its events held in the data array.
SELECT *
FROM myjson;

| myjson.uploadtimestamp | myjson.pdid |myjson.data|

|          1486631318873 |         123 | [{"data":{"unit":"rpm","value":"0","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E1","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10","x":null,"y":null,"z":null},"eventid":"E2","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.1","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":"1.1","y":"1.2","z":"2.2"},"eventid":"E3","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":"percentage","value":"50","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E4","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":null},{"data":{"unit":"kmph","value":"70","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E5","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}}] |

-- Flatten the data array: one output row per event, with the document-level
-- columns repeated on every row and each Data member projected as a column.
SELECT
    -- document level
    j.uploadTimeStamp,
    j.PDID,
    -- event envelope
    e.evt.EventID,
    e.evt.PDID,
    e.evt.`Timestamp`,
    e.evt.Timezone,
    e.evt.Version,
    -- event payload
    e.evt.Data.unit,
    e.evt.Data.value,
    e.evt.Data.heading,
    e.evt.Data.loc3,
    e.evt.Data.loc,
    e.evt.Data.loc1,
    e.evt.Data.loc4,
    e.evt.Data.speed,
    e.evt.Data.x,
    e.evt.Data.y,
    e.evt.Data.z
FROM myjson j
LATERAL VIEW explode(data) e AS evt;
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
| j.uploadtimestamp | j.pdid | eventid | pdid |   timestamp   | timezone | version |    unit    | value | heading | loc3  |    loc    |   loc1    | loc4  | speed |  x   |  y   |  z   |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
|     1486631318873 |    123 | E1      |  123 | 1486631318873 |      330 | 1.0     | rpm        | 0     | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | NULL | NULL | NULL |
|     1486631318873 |    123 | E2      |  123 | 1486631318873 |      330 | 1.1     | NULL       | NULL  | N       | false | 14.022425 | 78.760587 | false | 10    | NULL | NULL | NULL |
|     1486631318873 |    123 | E3      |  123 | 1486631318873 |      330 | 1.0     | NULL       | NULL  | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | 1.1  | 1.2  | 2.2  |
|     1486631318873 |    123 | E4      |  123 | 1486631318873 |      330 | 1.0     | percentage | 50    | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | NULL | NULL | NULL |
|     1486631318873 |    123 | E5      |  123 | 1486631318873 |      330 | 1.0     | kmph       | 70    | NULL    | NULL  | NULL      | NULL      | NULL  | NULL  | NULL | NULL | NULL |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+

答案 1 :(得分:-1)

我也遇到过同样的问题,后来决定编写一个自定义输入格式,用来提取跨多行(经过美化打印)的JSON记录。

这个JsonRecordReader可以读取Hive中的多行JSON记录。它根据花括号 { 和 } 的配对平衡来提取记录:从第一个'{'开始,到与之配对的'}'为止,其间的内容被视为一条完整的记录。以下是代码段:

/**
 * Record reader that emits one complete JSON object per record, so that
 * pretty-printed (multi-line) JSON can be consumed record by record.
 *
 * <p>A record starts at the first start-tag byte found at or after the current
 * position and ends at the end-tag byte that balances it. Only the first byte
 * of each tag is compared. Tag bytes that appear inside a JSON string value
 * are ignored while a record is being read.
 *
 * <p>A record that starts inside the split is read to completion even when it
 * runs past the split end. NOTE(review): a split that begins in the middle of
 * a record will treat the next start-tag byte as a record start; presumably
 * the owning input format is expected to be non-splittable - confirm.
 */
public static class JsonRecordReader implements RecordReader<LongWritable, Text> {

    public static final String START_TAG_KEY = "jsoninput.start";
    public static final String END_TAG_KEY = "jsoninput.end";

    // Byte literals instead of String.getBytes(): no dependence on the
    // platform default charset.
    private byte[] startTag = { '{' };
    private byte[] endTag = { '}' };
    private long start;
    private long end;
    private FSDataInputStream fsin;
    private final DataOutputBuffer buffer = new DataOutputBuffer();

    /**
     * Opens the file behind {@code split} and positions the stream at the
     * start of the split.
     *
     * @param split   the file split to read
     * @param jobConf job configuration used to resolve the file system
     * @throws IOException if the file cannot be opened or positioned
     */
    public JsonRecordReader(FileSplit split, JobConf jobConf) throws IOException {
        // uncomment the below lines if you need to get the configuration
        // from JobConf:
        // startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
        // endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");

        // open the file and seek to the start of the split:
        start = split.getStart();
        end = start + split.getLength();
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(jobConf);
        fsin = fs.open(file);
        fsin.seek(start);
    }

    /**
     * Reads the next record.
     *
     * @param key   receives the stream position just after the record
     * @param value receives the raw bytes of the record, start tag through
     *              balancing end tag
     * @return {@code true} if a record was read; {@code false} once no further
     *         record starts inside the split or the file ends first
     * @throws IOException on read failure
     */
    @Override
    public boolean next(LongWritable key, Text value) throws IOException {
        if (fsin.getPos() >= end) {
            return false;
        }
        if (!seekToRecordStart()) {
            return false;
        }
        try {
            buffer.write(startTag);
            if (!readRecordBody()) {
                // file ended inside a record: the partial record is dropped
                return false;
            }
            key.set(fsin.getPos());
            // Hand the bytes over unchanged. Decoding them through
            // new String(byte[]) would apply the platform default charset and
            // could corrupt non-ASCII content.
            value.set(buffer.getData(), 0, buffer.getLength());
            return true;
        } finally {
            buffer.reset();
        }
    }

    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    @Override
    public Text createValue() {
        return new Text();
    }

    @Override
    public long getPos() throws IOException {
        return fsin.getPos();
    }

    @Override
    public void close() throws IOException {
        fsin.close();
    }

    /**
     * @return the consumed fraction of the split, in the range 0.0 to 1.0
     */
    @Override
    public float getProgress() throws IOException {
        if (end == start) {
            // empty split: avoid 0 / 0, which would yield NaN
            return 0.0f;
        }
        // The last record may extend past the split end, so cap at 1.0.
        return Math.min(1.0f, (fsin.getPos() - start) / (float) (end - start));
    }

    /**
     * Skips bytes until the start tag is consumed.
     *
     * @return {@code true} if a start tag was found; {@code false} at end of
     *         file or once the split end has been reached without finding one
     */
    private boolean seekToRecordStart() throws IOException {
        while (true) {
            int b = fsin.read();
            if (b == -1) {
                return false;
            }
            if (b == startTag[0]) {
                return true;
            }
            // End tags seen here are ignored on purpose: counting them would
            // push the nesting depth below zero and break the balancing of
            // the record that follows.
            if (fsin.getPos() >= end) {
                return false;
            }
        }
    }

    /**
     * Copies bytes into {@link #buffer} until the end tag that balances the
     * already-consumed start tag has been written.
     *
     * @return {@code true} if the record was completed; {@code false} if the
     *         file ended first
     */
    private boolean readRecordBody() throws IOException {
        int depth = 1;            // the opening start tag is already consumed
        boolean inString = false; // inside a double-quoted JSON string
        boolean escaped = false;  // previous byte was a backslash in a string

        while (true) {
            int b = fsin.read();
            if (b == -1) {
                return false;
            }
            buffer.write(b);

            if (inString) {
                // Tag bytes inside string values are content, not structure.
                if (escaped) {
                    escaped = false;
                } else if (b == '\\') {
                    escaped = true;
                } else if (b == '"') {
                    inString = false;
                }
            } else if (b == '"') {
                inString = true;
            } else if (b == startTag[0]) {
                depth++;
            } else if (b == endTag[0]) {
                depth--;
                if (depth == 0) {
                    return true;
                }
            }
        }
    }

}

此输入格式可与hive提供的JSON Serde一起使用,以读取多行JSON文件。

-- Table whose files are read through the custom JSON input format described
-- above and decoded with the HCatalog JsonSerDe.
CREATE TABLE books (
    id         string,
    bookname   string,
    properties struct<subscription:string, unit:string>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS
    INPUTFORMAT 'JsonInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

带样本的工作代码在这里:https://github.com/unayakdev/hive-json