我正在尝试使用read_json
处理带有pandas的嵌套json,但我得到的重复条目如下所示:
contributors_enabled 2013-11-30 20:48:42
created_at 2013-11-30 20:48:42
default_profile 2013-11-30 20:48:42
default_profile_image 2013-11-30 20:48:42
description 2013-11-30 20:48:42
favourites_count 2013-11-30 20:48:42
follow_request_sent 2013-11-30 20:48:42
...
source \
contributors_enabled <a href="http://twitter.com/download/iphone" r...
created_at <a href="http://twitter.com/download/iphone" r...
default_profile <a href="http://twitter.com/download/iphone" r...
default_profile_image <a href="http://twitter.com/download/iphone" r...
description <a href="http://twitter.com/download/iphone" r...
favourites_count <a href="http://twitter.com/download/iphone" r...
follow_request_sent...
如何将其更改为正确的格式?
我使用以下代码:
for line in gzip.open("/home/amrith/shared/twitter-stream/tweets-1385844523.txt.gz"):
tweet = read_json(line)
print tweet
输入行如下所示:
{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":10407,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"}
答案 0 :(得分:2)
dicts
_source_list
def flatten_json(nested_json: dict, exclude: list=['']) -> dict:
"""
Flatten a list of nested dicts.
"""
out = dict()
def flatten(x: (list, dict, str), name: str='', exclude=exclude):
if type(x) is dict:
for a in x:
if a not in exclude:
flatten(x[a], f'{name}{a}_')
elif type(x) is list:
i = 0
for a in x:
flatten(a, f'{name}{i}_')
i += 1
else:
out[name[:-1]] = x
flatten(nested_json)
return out
要创建数据集,我使用了给定的数据三次,
data
是json
data = {'stuff': [{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":True,"verified":False,"statuses_count":10407,"lang":"es","contributors_enabled":False,"is_translator":False,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":True,"default_profile":False,"default_profile_image":False,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":False,"retweeted":False,"filter_level":"medium","lang":"es"},
{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":True,"verified":False,"statuses_count":10407,"lang":"es","contributors_enabled":False,"is_translator":False,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":True,"default_profile":False,"default_profile_image":False,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":False,"retweeted":False,"filter_level":"medium","lang":"es"},
{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":True,"verified":False,"statuses_count":10407,"lang":"es","contributors_enabled":False,"is_translator":False,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":True,"default_profile":False,"default_profile_image":False,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":False,"retweeted":False,"filter_level":"medium","lang":"es"}]}
data
是list
的{{1}} dicts
data = [{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":True,"verified":False,"statuses_count":10407,"lang":"es","contributors_enabled":False,"is_translator":False,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":True,"default_profile":False,"default_profile_image":False,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":False,"retweeted":False,"filter_level":"medium","lang":"es"},
{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":True,"verified":False,"statuses_count":10407,"lang":"es","contributors_enabled":False,"is_translator":False,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":True,"default_profile":False,"default_profile_image":False,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":False,"retweeted":False,"filter_level":"medium","lang":"es"},
{"created_at":"Sun Dec 01 01:19:00 +0000 2013","id":12345,"id_str":"12345","text":"Todo va a estar bn :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":False,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":54321,"id_str":"54321","name":"xxxxx","screen_name":"xxxxx","location":"","url":null,"description":"No pretendo ser nadie mas y no soy perfecta lo se, tengo muchos errores tambi\u00e9n lo se pero me acepto y me amo como soy.","protected":false,"followers_count":71,"friends_count":64,"listed_count":0,"created_at":"Sun Feb 05 02:04:16 +0000 2012","favourites_count":218,"utc_offset":-21600,"time_zone":"Central Time (US & Canada)","geo_enabled":True,"verified":False,"statuses_count":10407,"lang":"es","contributors_enabled":False,"is_translator":False,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000116209016\/ff11dc9f5a2e05d2800a91cff08c2c73.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000736604157\/b6d36df6332a2cacb0d30b5328b668d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/483470963\/1385144720","profile_link_color":"9D1DCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":True,"default_profile":False,"default_profile_image":False,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[]},"favorited":False,"retweeted":False,"filter_level":"medium","lang":"es"}]
:flatten_json
是data
,如上所述json
df = pd.DataFrame([flatten_json(x) for x in data['stuff'])
是data
的{{1}},则如上所述list
dicts