在变量日期首次变为4之后,我试图删除数据框中每个组的后续行。
df = pd.DataFrame({"date": [1,2,3,3,4,1,2,3,3,4,1,1,1,4,4,4,1,1,1,2,2,3,3,3,4,4],
"variable": ["A", "A", "A","A","A","A", "A", "A","A","A", "B", "B", "B","B","B","B" ,"C", "C", "C","C", "D","D","D","D","D","D"],
"no": [1, 2.2, 3.5, 1.5, 1.5,1, 2.2, 3.5, 1.5, 1.5, 1.2, 1.3, 1.1, 2, 3,1, 2.2, 3.5, 1.5, 1.5, 1.2, 1.3, 1.1, 2, 3,9],
"value": [0.469112, -0.282863, -1.509059, -1.135632, 1.212112,0.469112, -0.282863, -1.509059, -1.135632, 1.212112, -0.173215,
0.119209, -1.044236, -0.861849, None,0.469112, -0.282863, -1.509059, -1.135632, 1.212112, -0.173215,
0.119209, -1.044236, -0.861849, None,0.87]})
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
4 4 A 1.5 1.212112
5 1 A 1.0 0.469112
6 2 A 2.2 -0.282863
7 3 A 3.5 -1.509059
8 3 A 1.5 -1.135632
9 4 A 1.5 1.212112
10 1 B 1.2 -0.173215
11 1 B 1.3 0.119209
12 1 B 1.1 -1.044236
13 4 B 2.0 -0.861849
14 4 B 3.0 NaN
15 4 B 1.0 0.469112
16 1 C 2.2 -0.282863
17 1 C 3.5 -1.509059
18 1 C 1.5 -1.135632
19 2 C 1.5 1.212112
20 2 D 1.2 -0.173215
21 3 D 1.3 0.119209
22 3 D 1.1 -1.044236
23 3 D 2.0 -0.861849
24 4 D 3.0 NaN
25 4 D 9.0 0.870000
到目前为止,我已经尝试了以下方法:
def tail_test(group):
group = group[~(group.date.eq(4) | group.date.shift().eq(4))]
return group
df_sub = df.groupby('variable').apply(tail_test).reset_index(drop=True)
输出:
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
4 2 A 2.2 -0.282863
5 3 A 3.5 -1.509059
6 3 A 1.5 -1.135632
7 1 B 1.2 -0.173215
8 1 B 1.3 0.119209
9 1 B 1.1 -1.044236
10 1 C 2.2 -0.282863
11 1 C 3.5 -1.509059
12 1 C 1.5 -1.135632
13 2 C 1.5 1.212112
14 2 D 1.2 -0.173215
15 3 D 1.3 0.119209
16 3 D 1.1 -1.044236
17 3 D 2.0 -0.861849
基本上,它会删除所有4,但此后不会删除所有后续值
我尝试的下一个方法是:
def f(df):
mask = (df.date == 4).cumsum() <= 1
return df[mask]
df_sub = df.groupby("variable").apply(f)
输出为:
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
4 4 A 1.5 1.212112
5 1 A 1.0 0.469112
6 2 A 2.2 -0.282863
7 3 A 3.5 -1.509059
8 3 A 1.5 -1.135632
9 1 B 1.2 -0.173215
10 1 B 1.3 0.119209
11 1 B 1.1 -1.044236
12 4 B 2.0 -0.861849
13 1 C 2.2 -0.282863
14 1 C 3.5 -1.509059
15 1 C 1.5 -1.135632
16 2 C 1.5 1.212112
17 2 D 1.2 -0.173215
18 3 D 1.3 0.119209
19 3 D 1.1 -1.044236
20 3 D 2.0 -0.861849
21 4 D 3.0 NaN
我可能犯了一些我不知道的愚蠢错误。请帮忙!
答案 0 :(得分:1)
IIUC,可以groupby
并使用cumprod
来检测何时开始过滤
df[df.groupby('variable').date.transform(lambda s: s.ne(4).cumprod().astype(bool))]
date variable no value
0 1 A 1.0 0.469112
1 2 A 2.2 -0.282863
2 3 A 3.5 -1.509059
3 3 A 1.5 -1.135632
10 1 B 1.2 -0.173215
11 1 B 1.3 0.119209
12 1 B 1.1 -1.044236
16 1 C 2.2 -0.282863
17 1 C 3.5 -1.509059
18 1 C 1.5 -1.135632
19 2 C 1.5 1.212112
20 2 D 1.2 -0.173215
21 3 D 1.3 0.119209
22 3 D 1.1 -1.044236
23 3 D 2.0 -0.861849