我有一个如下所示的数据框
df = pd.DataFrame({
'subject_id':[1,1,1,1,1,1,1,2,2,2,2,2],
'time_1' :['2173-04-03 12:35:00','2173-04-03 12:50:00','2173-04-05
12:59:00','2173-05-04 13:14:00','2173-05-05 13:37:00','2173-07-06
13:39:00','2173-07-08 11:30:00','2173-04-08 16:00:00','2173-04-09
22:00:00','2173-04-11 04:00:00','2173- 04-13 04:30:00','2173-04-14 08:00:00'],
'val' :[5,5,5,5,1,6,5,5,8,3,4,6]})
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
df['month'] = df['time_1'].dt.month
从上方的数据框中可以看到,之间缺少一些日期。 我想为这些日期创建新记录,并填写前一行的值
def dt(df):
r = pd.date_range(start=df.date.min(), end=df.date.max())
df.set_index('date').reindex(r)
new_df = df.groupby(['subject_id','month']).apply(dt)
这将生成所有日期。我只想在每个月的每个主题的输入日期间隔内找到丢失的日期
我确实尝试过此related post中的代码。虽然它对我有所帮助,但没有为我提供此更新/新要求的预期输出。当我们离开连接时,它将复制所有记录。我也不能进行内部联接,因为它将删除不匹配的列。我想要左联接和内联接的组合
当前,它会创建我一年中不需要的所有365天的新记录。像下面这样。这不是预期的
我只希望在输入日期间隔之间添加缺少的日期,如下所示。例如subject = 1,在第4个月中有第3和5的记录。但是第4位缺失因此,我们仅添加第4天的记录。与电流输出不同,我们不需要第六,第七等。同样在第7个月,记录第7天的丢失情况。因此我们只为此添加一条新记录
我希望我的输出如下所示
答案 0 :(得分:3)
这是问题,您需要resample
来补充新的日期,所以有必要。
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
df['date'] = df['time_1'].dt.floor('d')
df1 = (df.set_index('date')
.groupby('subject_id')
.resample('d')
.last()
.index
.to_frame(index=False))
print (df1)
subject_id date
0 1 2173-04-03
1 1 2173-04-04
2 1 2173-04-05
3 1 2173-04-06
4 1 2173-04-07
.. ... ...
99 2 2173-04-10
100 2 2173-04-11
101 2 2173-04-12
102 2 2173-04-13
103 2 2173-04-14
[104 rows x 2 columns]
想法是删除不必要的缺失行-您可以为最小连续错色值(此处为5)创建阈值,并删除行(通过轻松测试创建新列):
df2 = df1.merge(df, how='left')
thresh = 5
mask = df2['day'].notna()
s = mask.cumsum().mask(mask)
df2['count'] = s.map(s.value_counts())
df2 = df2[(df2['count'] < thresh) | (df2['count'].isna())]
print (df2)
subject_id date time_1 val day count
0 1 2173-04-03 2173-04-03 12:35:00 5.0 3.0 NaN
1 1 2173-04-03 2173-04-03 12:50:00 5.0 3.0 NaN
2 1 2173-04-04 NaT NaN NaN 1.0
3 1 2173-04-05 2173-04-05 12:59:00 5.0 5.0 NaN
32 1 2173-05-04 2173-05-04 13:14:00 5.0 4.0 NaN
33 1 2173-05-05 2173-05-05 13:37:00 1.0 5.0 NaN
95 1 2173-07-06 2173-07-06 13:39:00 6.0 6.0 NaN
96 1 2173-07-07 NaT NaN NaN 1.0
97 1 2173-07-08 2173-07-08 11:30:00 5.0 8.0 NaN
98 2 2173-04-08 2173-04-08 16:00:00 5.0 8.0 NaN
99 2 2173-04-09 2173-04-09 22:00:00 8.0 9.0 NaN
100 2 2173-04-10 NaT NaN NaN 1.0
101 2 2173-04-11 2173-04-11 04:00:00 3.0 11.0 NaN
102 2 2173-04-12 NaT NaN NaN 1.0
103 2 2173-04-13 2173-04-13 04:30:00 4.0 13.0 NaN
104 2 2173-04-14 2173-04-14 08:00:00 6.0 14.0 NaN
最后使用以前的解决方案:
df2 = df2.groupby(df['subject_id']).ffill()
dates = df2['time_1'].dt.normalize()
df2['time_1'] += np.where(dates == df2['date'], 0, df2['date'] - dates)
df2['day'] = df2['time_1'].dt.day
df2['val'] = df2['val'].astype(int)
print (df2)
subject_id date time_1 val day count
0 1 2173-04-03 2173-04-03 12:35:00 5 3 NaN
1 1 2173-04-03 2173-04-03 12:50:00 5 3 NaN
2 1 2173-04-04 2173-04-04 12:50:00 5 4 1.0
3 1 2173-04-05 2173-04-05 12:59:00 5 5 1.0
32 1 2173-05-04 2173-05-04 13:14:00 5 4 NaN
33 1 2173-05-05 2173-05-05 13:37:00 1 5 NaN
95 1 2173-07-06 2173-07-06 13:39:00 6 6 NaN
96 1 2173-07-07 2173-07-07 13:39:00 6 7 1.0
97 1 2173-07-08 2173-07-08 11:30:00 5 8 1.0
98 2 2173-04-08 2173-04-08 16:00:00 5 8 1.0
99 2 2173-04-09 2173-04-09 22:00:00 8 9 1.0
100 2 2173-04-10 2173-04-10 22:00:00 8 10 1.0
101 2 2173-04-11 2173-04-11 04:00:00 3 11 1.0
102 2 2173-04-12 2173-04-12 04:00:00 3 12 1.0
103 2 2173-04-13 2173-04-13 04:30:00 4 13 1.0
104 2 2173-04-14 2173-04-14 08:00:00 6 14 1.0
编辑:每月reindex
的解决方案:
df['time_1'] = pd.to_datetime(df['time_1'])
df['day'] = df['time_1'].dt.day
df['date'] = df['time_1'].dt.floor('d')
df['month'] = df['time_1'].dt.month
df1 = (df.drop_duplicates(['date','subject_id'])
.set_index('date')
.groupby(['subject_id', 'month'])
.apply(lambda x: x.reindex(pd.date_range(x.index.min(), x.index.max())))
.rename_axis(('subject_id','month','date'))
.index
.to_frame(index=False)
)
print (df1)
subject_id month date
0 1 4 2173-04-03
1 1 4 2173-04-04
2 1 4 2173-04-05
3 1 5 2173-05-04
4 1 5 2173-05-05
5 1 7 2173-07-06
6 1 7 2173-07-07
7 1 7 2173-07-08
8 2 4 2173-04-08
9 2 4 2173-04-09
10 2 4 2173-04-10
11 2 4 2173-04-11
12 2 4 2173-04-12
13 2 4 2173-04-13
14 2 4 2173-04-14
df2 = df1.merge(df, how='left')
df2 = df2.groupby(df2['subject_id']).ffill()
dates = df2['time_1'].dt.normalize()
df2['time_1'] += np.where(dates == df2['date'], 0, df2['date'] - dates)
df2['day'] = df2['time_1'].dt.day
df2['val'] = df2['val'].astype(int)
print (df2)
subject_id month date time_1 val day
0 1 4 2173-04-03 2173-04-03 12:35:00 5 3
1 1 4 2173-04-03 2173-04-03 12:50:00 5 3
2 1 4 2173-04-04 2173-04-04 12:50:00 5 4
3 1 4 2173-04-05 2173-04-05 12:59:00 5 5
4 1 5 2173-05-04 2173-05-04 13:14:00 5 4
5 1 5 2173-05-05 2173-05-05 13:37:00 1 5
6 1 7 2173-07-06 2173-07-06 13:39:00 6 6
7 1 7 2173-07-07 2173-07-07 13:39:00 6 7
8 1 7 2173-07-08 2173-07-08 11:30:00 5 8
9 2 4 2173-04-08 2173-04-08 16:00:00 5 8
10 2 4 2173-04-09 2173-04-09 22:00:00 8 9
11 2 4 2173-04-10 2173-04-10 22:00:00 8 10
12 2 4 2173-04-11 2173-04-11 04:00:00 3 11
13 2 4 2173-04-12 2173-04-12 04:00:00 3 12
14 2 4 2173-04-13 2173-04-13 04:30:00 4 13
15 2 4 2173-04-14 2173-04-14 08:00:00 6 14
答案 1 :(得分:1)
有帮助吗?
def fill_dates(df):
result = pd.DataFrame()
for i,row in df.iterrows():
if i == 0:
result = result.append(row)
else:
start_date = result.iloc[-1]['time_1']
end_date = row['time_1']
# print(start_date, end_date)
delta = (end_date - start_date).days
# print(delta)
if delta > 0 and start_date.month == end_date.month:
for j in range(delta):
day = start_date + timedelta(days=j+1)
new_row = result.iloc[-1].copy()
new_row['time_1'] = day
new_row['remarks'] = 'added'
if new_row['time_1'].date() != row['time_1'].date():
result = result.append(new_row)
result = result.append(row)
else:
result = result.append(row)
result.reset_index(inplace = True)
return result