我正在尝试使用Hive JSON SerDe将Twitter JSON导入Hive表。我先把JSON导入到一个用ROW FORMAT SERDE定义的表中,然后再将其导入到另一个以RCFile格式存储的表中。这个流程在一定程度上可以运行,但随后我遇到了如下的ClassCastException:
java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row [Error getting row data with exception java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.Double
at org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaDoubleObjectInspector.get(JavaDoubleObjectInspector.java:40)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:259)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:307)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:354)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:354)
at org.apache.hadoop.hive.serde2.SerDeUtils.buildJSONString(SerDeUtils.java:354)
at org.apache.hadoop.hive.serde2.SerDeUtils.getJSONString(SerDeUtils.java:220)
at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:667)
at org.apache.hadoop.hive.ql.exec.ExecMapper.map(ExecMapper.java:141)
at org.apache.hadoop
这是我用来定义SerDe表的模式:
-- gh_raw: external Hive table over raw Twitter status JSON, one JSON object
-- per row, deserialized by the OpenX JSON SerDe from /user/ahanna/gh_raw.
--
-- Column and struct field names must match the JSON keys exactly, because
-- the SerDe maps values by name.
--
-- Type notes:
--   * Every numeric id field (user.id, user_mentions.id) is declared bigint.
--     Twitter ids are 64-bit and do not fit in a 32-bit int. Any table that
--     copies these structs (for example an RCFile copy) must declare the
--     same bigint type.
--   * Counters and pixel sizes stay int.
--   * `user` and `location` are backtick-quoted because they collide with
--     Hive keywords (USER is reserved from Hive 1.2 onward).
--   * NOTE(review): rows whose coordinates or bounding_box arrays contain
--     integer-valued JSON numbers have been reported to raise
--     "Integer cannot be cast to Double" with some JSON SerDe builds. That
--     is a SerDe issue rather than a schema issue - confirm the SerDe jar
--     in use coerces integers to double.
CREATE EXTERNAL TABLE gh_raw (
    coordinates struct<
        coordinates: array<double>,
        type: string>,
    created_at string,
    entities struct<
        hashtags: array<struct<
            text: string>>,
        media: array<struct<
            display_url: string,
            expanded_url: string,
            media_url: string,
            media_url_https: string,
            sizes: struct<
                large: struct<
                    h: int,
                    resize: string,
                    w: int>,
                medium: struct<
                    h: int,
                    resize: string,
                    w: int>,
                small: struct<
                    h: int,
                    resize: string,
                    w: int>,
                thumb: struct<
                    h: int,
                    resize: string,
                    w: int>>,
            type: string,
            url: string>>,
        urls: array<struct<
            display_url: string,
            expanded_url: string,
            url: string>>,
        user_mentions: array<struct<
            id: bigint,
            name: string,
            screen_name: string>>>,
    geo struct<
        coordinates: array<double>,
        type: string>,
    id_str string,
    in_reply_to_screen_name string,
    in_reply_to_status_id_str string,
    in_reply_to_user_id_str string,
    place struct<
        attributes: struct<
            locality: string,
            region: string,
            street_address: string>,
        bounding_box: struct<
            coordinates: array<array<array<double>>>,
            type: string>,
        country: string,
        country_code: string,
        full_name: string,
        name: string,
        place_type: string,
        url: string>,
    possibly_sensitive boolean,
    retweeted_status struct<
        coordinates: struct<
            coordinates: array<double>,
            type: string>,
        created_at: string,
        entities: struct<
            hashtags: array<struct<
                text: string>>,
            media: array<struct<
                display_url: string,
                expanded_url: string,
                media_url: string,
                media_url_https: string,
                sizes: struct<
                    large: struct<
                        h: int,
                        resize: string,
                        w: int>,
                    medium: struct<
                        h: int,
                        resize: string,
                        w: int>,
                    small: struct<
                        h: int,
                        resize: string,
                        w: int>,
                    thumb: struct<
                        h: int,
                        resize: string,
                        w: int>>,
                type: string,
                url: string>>,
            urls: array<struct<
                display_url: string,
                expanded_url: string,
                url: string>>,
            user_mentions: array<struct<
                id: bigint,
                name: string,
                screen_name: string>>>,
        favorited: boolean,
        geo: struct<
            coordinates: array<double>,
            type: string>,
        id_str: string,
        in_reply_to_screen_name: string,
        in_reply_to_status_id_str: string,
        in_reply_to_user_id_str: string,
        place: struct<
            attributes: struct<
                locality: string,
                region: string,
                street_address: string>,
            bounding_box: struct<
                coordinates: array<array<array<double>>>,
                type: string>,
            country: string,
            country_code: string,
            full_name: string,
            name: string,
            place_type: string,
            url: string>,
        possibly_sensitive: boolean,
        scopes: struct<
            followers: boolean>,
        source: string,
        text: string,
        truncated: boolean,
        `user`: struct<
            contributors_enabled: boolean,
            created_at: string,
            default_profile: boolean,
            default_profile_image: boolean,
            description: string,
            favourites_count: int,
            followers_count: int,
            friends_count: int,
            geo_enabled: boolean,
            id: bigint,
            id_str: string,
            is_translator: boolean,
            lang: string,
            listed_count: int,
            `location`: string,
            name: string,
            profile_background_color: string,
            profile_background_image_url: string,
            profile_background_image_url_https: string,
            profile_background_tile: boolean,
            profile_banner_url: string,
            profile_image_url: string,
            profile_image_url_https: string,
            profile_link_color: string,
            profile_sidebar_border_color: string,
            profile_sidebar_fill_color: string,
            profile_text_color: string,
            profile_use_background_image: boolean,
            protected: boolean,
            screen_name: string,
            statuses_count: int,
            time_zone: string,
            url: string,
            utc_offset: int,
            verified: boolean>>,
    source string,
    text string,
    truncated boolean,
    `user` struct<
        contributors_enabled: boolean,
        created_at: string,
        default_profile: boolean,
        default_profile_image: boolean,
        description: string,
        favourites_count: int,
        followers_count: int,
        friends_count: int,
        geo_enabled: boolean,
        id: bigint,
        id_str: string,
        is_translator: boolean,
        lang: string,
        listed_count: int,
        `location`: string,
        name: string,
        profile_background_color: string,
        profile_background_image_url: string,
        profile_background_image_url_https: string,
        profile_background_tile: boolean,
        profile_banner_url: string,
        profile_image_url: string,
        profile_image_url_https: string,
        profile_link_color: string,
        profile_sidebar_border_color: string,
        profile_sidebar_fill_color: string,
        profile_text_color: string,
        profile_use_background_image: boolean,
        protected: boolean,
        screen_name: string,
        statuses_count: int,
        time_zone: string,
        url: string,
        utc_offset: int,
        verified: boolean>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/ahanna/gh_raw';
我发现,当某条记录中包含一组坐标或一个边界框时,查询就会崩溃。
我认为这是我所使用的JSON SerDe的一个bug,但我并不确定。我目前使用的这个SerDe是我从源码重新编译的,其作者称已经修复了这个问题,但仍然无效:https://github.com/brndnmtthws/Hive-JSON-Serde
答案 0 :(得分:1)
试试这个SerDe - https://github.com/rcongiu/Hive-JSON-Serde。 我在尝试从推文中读取坐标时也遇到了同样的异常,换用这个SerDe之后问题就解决了!
编译好的二进制文件可在此处下载,因此您无需自行构建 - http://www.congiu.net/hive-json-serde/
答案 1 :(得分:0)
尝试使用bigint而不是int。它对我有用。