以下是我要整理的数据示例:
# Sample payload: a list with one record per location. Per-date counts are
# nested under timelines -> <case type> -> timeline, keyed by ISO timestamp.
location = [
    {
        'id': 225,
        'country': 'US',
        'country_code': 'US',
        'country_population': 327167434,
        'province': '',
        'last_updated': '2020-05-06T11:33:46.184263Z',
        'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
        'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
        'timelines': {
            'confirmed': {
                'latest': 1204351,
                'timeline': {
                    '2020-01-22T00:00:00Z': 1,
                    '2020-01-23T00:00:00Z': 1,
                    '2020-01-24T00:00:00Z': 2,
                },
            },
            'deaths': {
                'latest': 71064,
                'timeline': {
                    '2020-01-22T00:00:00Z': 0,
                    '2020-01-23T00:00:00Z': 0,
                    '2020-01-24T00:00:00Z': 0,
                },
            },
        },
    },
]
我无法把确诊(confirmed)和死亡(deaths)的时间线整理成可用的格式。我要么得到每个日期各占一列的结果:
# Asker's first attempt: walk down to the confirmed timeline via record_path.
# The question reports that this yields one column per date.
pd.json_normalize(location, record_path=['timelines', 'confirmed', 'timeline'])
或包含日期且没有计数值的行:
# Asker's second attempt: normalize only the first record's 'confirmed' block.
# The question reports that this gives the dates but no usable count values.
pd.json_normalize(location[0]['timelines']['confirmed'])
所需的输出类似于:
预先感谢;非常感谢您的协助。
答案 0 :(得分:3)
沿用 @Trenton Mckinney 提供的数据并更进一步:我们可以在 Pandas 之外完成所有处理,再把最终结果放入 DataFrame:
第1步:创建一个函数来处理数据:
def extract_data(location, keyword):
    """Flatten one timeline of every location record into a long DataFrame.

    Args:
        location: Iterable of location dicts. Each record is read for
            'country', 'country_population' and the nested mapping
            ``timelines[keyword]['timeline']`` (date -> count).
        keyword: Name of the timeline to extract, e.g. 'confirmed' or
            'deaths'. Any other timeline name present in the records
            works the same way.

    Returns:
        A DataFrame indexed by ('country', 'country_population') with the
        columns '<keyword>.timeline.date' and '<keyword>.timeline.count',
        one row per (record, date) pair. An empty ``location`` yields an
        empty frame with the same columns.
    """
    rows = []
    for ent in location:
        # Date -> count mapping for the requested timeline.
        timeline = ent.get('timelines').get(keyword).get('timeline')
        country = ent.get('country')
        population = ent.get('country_population')
        # One flat tuple per date, tagged with the owning country.
        rows.extend(
            (date, count, country, population)
            for date, count in timeline.items()
        )

    cols = ['date', 'count', 'country', 'country_population']
    res = pd.DataFrame(rows, columns=cols)
    res = res.set_index(['country', 'country_population'])
    # Prefix with the keyword so frames for different timelines can be
    # concatenated side by side without column-name clashes.
    return res.add_prefix(f'{keyword}.timeline.')
第2步:对每个关键字(“confirmed” 或 “deaths”)调用该函数:
# Build one long-format frame per case type with the helper from step 1.
confirmed, deaths = (
    extract_data(location, case_type) for case_type in ('confirmed', 'deaths')
)
第3步:在列轴上concatenate数据框:
# Place the two frames side by side; rows are matched on their shared index.
pd.concat([confirmed, deaths], axis='columns')
confirmed.timeline.date confirmed.timeline.count deaths.timeline.date deaths.timeline.count
country country_population
US 327167434 2020-01-22T00:00:00Z 1 2020-01-22T00:00:00Z 0
327167434 2020-01-23T00:00:00Z 1 2020-01-23T00:00:00Z 0
327167434 2020-01-24T00:00:00Z 2 2020-01-24T00:00:00Z 0
AF 327167435 2020-02-22T00:00:00Z 2 2020-02-22T00:00:00Z 1
327167435 2020-02-23T00:00:00Z 2 2020-02-23T00:00:00Z 1
327167435 2020-02-24T00:00:00Z 3 2020-02-24T00:00:00Z 1
AS 327167436 2020-03-22T00:00:00Z 3 2020-03-22T00:00:00Z 2
327167436 2020-03-23T00:00:00Z 3 2020-03-23T00:00:00Z 2
327167436 2020-03-24T00:00:00Z 4 2020-03-24T00:00:00Z 2
更新:我很认同 @DanilaGanchar 的反馈,因此决定重写代码。所有计算都在 Pandas 之外完成,直到最后一步才构建 DataFrame;运行时间降到了 853 µs:
# Column names for the (date, count) pairs of each timeline. Defined once,
# outside the loop, under names that do not clobber the `confirmed` and
# `deaths` DataFrames built in the earlier steps.
confirmed_keys = ('confirmed.timeline.date', 'confirmed.timeline.count')
deaths_keys = ('deaths.timeline.date', 'deaths.timeline.count')

# One flat dict per (location, date) row; turned into a DataFrame at the end.
d = []
for entry in location:
    # Country-level fields repeated on every row of this location.
    country_population = {
        'country': entry.get('country'),
        'country_population': entry.get('country_population'),
    }
    # (date, count) pairs of the confirmed and deaths timelines.
    timelines = entry.get('timelines')
    confirmed_timeline = timelines.get('confirmed').get('timeline').items()
    death_timeline = timelines.get('deaths').get('timeline').items()
    # Pair the two timelines positionally; zip stops at the shorter one.
    for conf, death in zip(confirmed_timeline, death_timeline):
        # Key order here is the column order of the resulting frame.
        d.append({
            **dict(zip(confirmed_keys, conf)),
            **country_population,
            **dict(zip(deaths_keys, death)),
        })

pd.DataFrame(d)
confirmed.timeline.date confirmed.timeline.count country country_population deaths.timeline.date deaths.timeline.count
0 2020-01-22T00:00:00Z 1 US 327167434 2020-01-22T00:00:00Z 0
1 2020-01-23T00:00:00Z 1 US 327167434 2020-01-23T00:00:00Z 0
2 2020-01-24T00:00:00Z 2 US 327167434 2020-01-24T00:00:00Z 0
3 2020-02-22T00:00:00Z 2 AF 327167435 2020-02-22T00:00:00Z 1
4 2020-02-23T00:00:00Z 2 AF 327167435 2020-02-23T00:00:00Z 1
5 2020-02-24T00:00:00Z 3 AF 327167435 2020-02-24T00:00:00Z 1
6 2020-03-22T00:00:00Z 3 AS 327167436 2020-03-22T00:00:00Z 2
7 2020-03-23T00:00:00Z 3 AS 327167436 2020-03-23T00:00:00Z 2
8 2020-03-24T00:00:00Z 4 AS 327167436 2020-03-24T00:00:00Z 2
答案 1 :(得分:1)
注意:confirmed.timeline.date 和 deaths.timeline.date 的长度必须相同。
import pandas as pd
# Load the raw records into a frame.
top = pd.DataFrame(location)

# Work on a copy holding only the identifying columns and the nested timelines.
d = top[['country', 'country_population', 'timelines']].copy()

# For each case type, split the timeline dict into two list-valued columns:
# one holding the dates (keys) and one holding the counts (values).
for kind in ('confirmed', 'deaths'):
    d[[f'{kind}.timeline.date', f'{kind}.timeline.count']] = d.timelines.apply(
        lambda x, kind=kind: pd.Series(
            [list(x[kind]['timeline'].keys()), list(x[kind]['timeline'].values())]
        )
    )
d = d.drop(columns=['timelines'])
d

# Explode every list column into one row per element, line the exploded
# columns up side by side, then join them back onto the country columns.
cols = ['confirmed.timeline.date', 'confirmed.timeline.count', 'deaths.timeline.date', 'deaths.timeline.count']
exploded = [d.explode(col)[col] for col in cols]
d = d[['country', 'country_population']].join(pd.concat(exploded, axis=1))
print(d)
country country_population confirmed.timeline.date confirmed.timeline.count deaths.timeline.date deaths.timeline.count
0 US 327167434 2020-01-22T00:00:00Z 1 2020-01-22T00:00:00Z 0
0 US 327167434 2020-01-23T00:00:00Z 1 2020-01-23T00:00:00Z 0
0 US 327167434 2020-01-24T00:00:00Z 2 2020-01-24T00:00:00Z 0
1 AF 327167435 2020-02-22T00:00:00Z 2 2020-02-22T00:00:00Z 1
1 AF 327167435 2020-02-23T00:00:00Z 2 2020-02-23T00:00:00Z 1
1 AF 327167435 2020-02-24T00:00:00Z 3 2020-02-24T00:00:00Z 1
2 AS 327167436 2020-03-22T00:00:00Z 3 2020-03-22T00:00:00Z 2
2 AS 327167436 2020-03-23T00:00:00Z 3 2020-03-23T00:00:00Z 2
2 AS 327167436 2020-03-24T00:00:00Z 4 2020-03-24T00:00:00Z 2
# Three-record sample used by the answers. The literal must open on the same
# line as the assignment: a bare `location =` followed by a newline is a
# SyntaxError.
location = [
    {
        'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
        'country': 'US',
        'country_code': 'US',
        'country_population': 327167434,
        'id': 225,
        'last_updated': '2020-05-06T11:33:46.184263Z',
        'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
        'province': '',
        'timelines': {
            'confirmed': {
                'latest': 1204351,
                'timeline': {
                    '2020-01-22T00:00:00Z': 1,
                    '2020-01-23T00:00:00Z': 1,
                    '2020-01-24T00:00:00Z': 2,
                },
            },
            'deaths': {
                'latest': 71064,
                'timeline': {
                    '2020-01-22T00:00:00Z': 0,
                    '2020-01-23T00:00:00Z': 0,
                    '2020-01-24T00:00:00Z': 0,
                },
            },
        },
    },
    {
        'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
        'country': 'AF',
        'country_code': 'AF',
        'country_population': 327167435,
        'id': 226,
        'last_updated': '2020-05-06T11:33:46.184263Z',
        'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
        'province': '',
        'timelines': {
            'confirmed': {
                'latest': 1204351,
                'timeline': {
                    '2020-02-22T00:00:00Z': 2,
                    '2020-02-23T00:00:00Z': 2,
                    '2020-02-24T00:00:00Z': 3,
                },
            },
            'deaths': {
                'latest': 71064,
                'timeline': {
                    '2020-02-22T00:00:00Z': 1,
                    '2020-02-23T00:00:00Z': 1,
                    '2020-02-24T00:00:00Z': 1,
                },
            },
        },
    },
    {
        'coordinates': {'latitude': '37.0902', 'longitude': '-95.7129'},
        'country': 'AS',
        'country_code': 'AS',
        'country_population': 327167436,
        'id': 227,
        'last_updated': '2020-05-06T11:33:46.184263Z',
        'latest': {'confirmed': 1204351, 'deaths': 71064, 'recovered': 0},
        'province': '',
        'timelines': {
            'confirmed': {
                'latest': 1204351,
                'timeline': {
                    '2020-03-22T00:00:00Z': 3,
                    '2020-03-23T00:00:00Z': 3,
                    '2020-03-24T00:00:00Z': 4,
                },
            },
            'deaths': {
                'latest': 71064,
                'timeline': {
                    '2020-03-22T00:00:00Z': 2,
                    '2020-03-23T00:00:00Z': 2,
                    '2020-03-24T00:00:00Z': 2,
                },
            },
        },
    },
]
答案 2 :(得分:1)
我看了被采纳的答案:它用了 2 个 df、2 次 apply()、1 次 copy()、1 次 drop()、1 次 concat() 和 1 次 join()。但其实只用 json_normalize 就可以完成。主要问题出在 timeline 的结构上:它是一个 dict,而不是由 dict 组成的 list。因此,您需要将 timeline 转换为如下形式:
[{
'confirmed_dt': '2020-01-22T00:00:00Z',
'confirmed_count': 0,
'deaths_dt': '2020-01-22T00:00:00Z',
'deaths_count': 0,
}, ...]
下面是一个示例实现:
def format_timelines(data: list):
    """Attach a flat, row-per-date ``'new_timelines'`` list to every record.

    Each record's ``timelines['confirmed']['timeline']`` and
    ``timelines['deaths']['timeline']`` mappings (date -> count) are paired
    positionally and rewritten as a list of dicts, the shape
    ``json_normalize`` can use as a ``record_path``.

    Args:
        data: List of location dicts. Every record must contain
            ``timelines.confirmed.timeline`` and ``timelines.deaths.timeline``.

    Returns:
        The same ``data`` list. Records are modified in place: each gains a
        ``'new_timelines'`` key holding dicts with the keys
        'confirmed_dt', 'deaths_dt', 'confirmed_count' and 'deaths_count'.
    """
    for rec in data:
        confirmed = rec['timelines']['confirmed']['timeline']
        deaths = rec['timelines']['deaths']['timeline']
        # zip pairs entries by position and stops at the shorter timeline,
        # so both mappings are expected to list the same dates in order.
        rec['new_timelines'] = [
            {
                'confirmed_dt': confirmed_dt,
                'deaths_dt': deaths_dt,
                'confirmed_count': confirmed_count,
                'deaths_count': deaths_count,
            }
            for (confirmed_dt, confirmed_count), (deaths_dt, deaths_count)
            in zip(confirmed.items(), deaths.items())
        ]
    return data
# Expand each record's 'new_timelines' list into rows and carry the country
# fields along as metadata columns. Called through the `pd` namespace because
# the bare name `json_normalize` is never imported in this file.
df = pd.json_normalize(
    format_timelines(location),
    record_path=['new_timelines'],
    meta=['country', 'country_population'],
)
以上代码只是一个例子。
我认为这样更简单。希望对您有所帮助。
答案 3 :(得分:-1)
我对用 Pandas 做 melt(宽表转长表)/重塑很感兴趣,所以想用这种方法尝试解决。在时间线的每个日期行里重复最新值没有意义,因此我把数据拆成了两个 DataFrame:最新状态(latest)和时间线(timelines)。我对这些还比较陌生,欢迎任何反馈或改进建议。
# Flatten every nested key into its own column, joining path parts with '_'.
df = pd.json_normalize(location, sep='_')

# Columns describing the current state of a location (everything that is not
# a per-date timeline value).
# NOTE(review): 'timelines_recovered_latest' assumes the payload carries a
# 'recovered' timeline; the sample records shown do not — confirm.
latest_cols = [
    'id', 'country', 'country_code', 'country_population', 'province',
    'last_updated', 'coordinates_latitude', 'coordinates_longitude',
    'latest_confirmed', 'latest_deaths', 'latest_recovered',
    'timelines_confirmed_latest', 'timelines_deaths_latest',
    'timelines_recovered_latest',
]
covid_latest = df[latest_cols]

# Keep 'id' and 'country' (the first two entries) so they can serve as
# identifiers for the timeline rows; drop the remaining current-state columns.
latest_cols = latest_cols[2:]

# Wide -> long: one row per (id, country, flattened timeline column).
covid_timelines = df.drop(columns=latest_cols).melt(
    id_vars=['id', 'country'],
    var_name='status_date',
    value_name='case_count',
)

# The flattened column name is split on '_' into four parts; only the case
# type and the date string are kept.
timeline_cols = covid_timelines['status_date'].str.split('_', expand=True)
timeline_cols.columns = ['timelines', 'case_type', 'timeline', 'str_date']
timeline_cols = timeline_cols[['case_type', 'str_date']]

# Attach the split parts to the long frame.
covid_timelines = pd.concat([covid_timelines, timeline_cols], axis=1)

# Replace the raw column-name string with a real date, then drop the
# now-redundant string column.
covid_timelines['status_date'] = pd.to_datetime(timeline_cols['str_date']).dt.date
covid_timelines = covid_timelines.drop(columns='str_date')

# Long -> wide on case type: one column per case type, one row per
# (id, country, date).
covid_timelines.pivot_table(
    index=['id', 'country', 'status_date'],
    columns='case_type',
    values='case_count',
)