创建HIVE表时出现JSON解析错误

时间:2016-07-22 09:07:52

标签: json hadoop hive cloudera hiveql

首先,我是Hive的新手。

我通过apache flume获取了twitter数据。

{
"filter_level": "low",
"retweeted": false,
"in_reply_to_screen_name": null,
"possibly_sensitive": false,
"truncated": false,
"lang": "en",
"in_reply_to_status_id_str": null,
"id": 756378998838530048,
"in_reply_to_user_id_str": null,
"timestamp_ms": "1469169780822",
"in_reply_to_status_id": null,
"created_at": "Fri Jul 22 06:43:00 +0000 2016",
"favorite_count": 0,
"place": null,
"coordinates": null,
"text": "#furry pokemon sex mermaid sex position",
"contributors": null,
"geo": null,
"entities": {
    "symbols": [],
    "urls": [{
        "expanded_url": "http://14.gerase.tk",
        "indices": [40, 63],
        "display_url": "14.gerase.tk",
        "url": ""
    }],
    "hashtags": [{
        "text": "furry",
        "indices": [0, 6]
    }],
    "user_mentions": []
},
"is_quote_status": false,
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client<\/a>",
"favorited": false,
"in_reply_to_user_id": null,
"retweet_count": 0,
"id_str": "756378998838530048",
"user": {
    "location": null,
    "default_profile": true,
    "statuses_count": 3436,
    "profile_background_tile": false,
    "lang": "ru",
    "profile_link_color": "2B7BB9",
    "id": 752318303280955392,
    "following": null,
    "favourites_count": 0,
    "protected": false,
    "profile_text_color": "333333",
    "verified": false,
    "description": null,
    "contributors_enabled": false,
    "profile_sidebar_border_color": "C0DEED",
    "name": "Мария Виноградова",
    "profile_background_color": "F5F8FA",
    "created_at": "Mon Jul 11 01:47:15 +0000 2016",
    "default_profile_image": false,
    "followers_count": 5,
    "profile_image_url_https": "https://pbs.twimg.com/profile_images/753398763201425408/X_2mAGt1_normal.jpg",
    "geo_enabled": false,
    "profile_background_image_url": "",
    "profile_background_image_url_https": "",
    "follow_request_sent": null,
    "url": null,
    "utc_offset": null,
    "time_zone": null,
    "notifications": null,
    "profile_use_background_image": true,
    "friends_count": 21,
    "profile_sidebar_fill_color": "DDEEF6",
    "screen_name": "afinafedorova2",
    "id_str": "752318303280955392",
    "profile_image_url": "http://pbs.twimg.com/profile_images/753398763201425408/X_2mAGt1_normal.jpg",
    "listed_count": 5,
    "is_translator": false
}

}

我正在使用https://github.com/cloudera/cdh-twitter-example中提到的HiveJSONSerDe 这是我试图运行的Hive查询。

create external table twitterdata(
filter_level string,
retweeted boolean,
in_reply_to_screen_name string,
possibly_sensitive boolean,
trauncated boolean,
lang string,
in_reply_to_status_id_str string,
id bigint,
in_reply_to_user_id_str string,
timestamp_ms string,
in_reply_to_status_id int,
created_at string,
favourite_count int,
place string,
coordinates string,
text string,
contributors string,
geo string,
entities STRUCT<
        urls:ARRAY<STRUCT<expanded_url:STRING,url:STRING>>,
    hashtags:ARRAY<STRUCT<text:STRING>>,
        user_mentions:ARRAY<STRUCT<screen_name:STRING,name:STRING>>>,
is_quote_status boolean,
source string,
favotited boolean,
in_reply_to_user_id int,
retweet_count int,
id_str string,
user STRUCT<
    location:STRING,
    default_profile:boolean,
    statuses_count:int,
    profile_background_tile:boolean,
    lang: string,
    profile_link_color: string,
    id: bigint,
    following: string,
    protected: boolean,
    profile_text_color: string,
    verified: boolean,
    description: string,
    contributors_enabled: boolean,
    name: string,   
    created_at: string,
    default_profile_image: boolean,
    followers_count: int,
    profile_image_url_https: string,
    geo_enabled: boolean
    url: string,
    time_zone: string,
    friends_count: int,
    screen_name: string,
    id_str: string,
    listed_count: int,
    is_translator: boolean>
)
ROW FORMAT SERDE 'com.cloudera.hive.serde.JSONSerDe'
STORED AS TEXTFILE
LOCATION '/Twitter/Pokemon/';

但它显示有关解析JSON脚本的错误。

FAILED: Parse Error: line 31:2 mismatched input 'location' expecting identifier in column specification

我无法找到创建表查询中的错误。请帮助。

1 个答案:

答案 0 :(得分:0)

您可能需要查看数据文件中的该行。您的JSON文件最有可能是完整的,或者与模式不匹配。

如果您希望不断将数据加载到配置单元表,您可能还需要查看StreamSets。您可以使用它来捕获JSON数据,自动转换为Avro并填充和更新配置单元表。

完全披露,我是该项目的提交者。