当我使用pandas.DataFrame.apply时,它显示“具有多个元素的数组的真值不明确。请使用a.any()或a.all()”
import pandas as pd
import numpy as np
# Sample data: 50 rows x 2 columns drawn from the standard normal
# distribution (no seed is set, so the values differ on every run).
df=pd.DataFrame(np.random.randn(50,2))
def fun(df):
    """Clip values lying more than 3.1483 MADs away from the median.

    Intended for ``DataFrame.apply(fun)``, which by default calls this
    function once per column, passing that column as a ``Series``.

    Args:
        df: One column of numeric data (a ``pandas.Series``).

    Returns:
        A new ``Series`` in which values above ``median + 3.1483 * mad`` are
        replaced by that upper bound and values below
        ``median - 3.1483 * mad`` are replaced by that lower bound. The
        input object is not modified.
    """
    median = df.median()
    # Median *absolute* deviation. Without abs() this would be the median of
    # the signed deviations, which is ~0 and would make both bounds collapse
    # onto the median itself.
    mad = (df - median).abs().median()
    lower = median - 3.1483 * mad
    upper = median + 3.1483 * mad
    # ``df > upper`` is a boolean Series and cannot be used in a plain ``if``
    # (ValueError: truth value is ambiguous); clip applies both bounds
    # element-wise instead.
    return df.clip(lower=lower, upper=upper)
# Run fun on the DataFrame through apply (default axis=0: fun is called once
# per column with that column as a Series). The result is not assigned here.
df.apply(fun)
答案 0 :(得分:2)
我认为需要:
def fun(df):
    """Replace outliers in one column by the nearest MAD-based bound.

    Meant to be used as ``df.apply(fun)``, where pandas passes each column
    as a ``Series``.

    Args:
        df: One column of numeric data (a ``pandas.Series``).

    Returns:
        A new ``Series`` where values greater than ``median + 3.1483 * mad``
        are set to that upper bound and values smaller than
        ``median - 3.1483 * mad`` are set to that lower bound; all other
        values are returned unchanged.
    """
    median = df.median()
    # Take the absolute deviation before the median: the median of the signed
    # deviations is ~0, which would turn both bounds into the median and
    # overwrite every value with it.
    mad = (df - median).abs().median()
    upper = median + 3.1483 * mad
    lower = median - 3.1483 * mad
    # Boolean masks are applied element-wise with mask() rather than tested
    # in an ``if``, which is not valid for a whole Series.
    df = df.mask(df > upper, upper)
    df = df.mask(df < lower, lower)
    return df
# Apply fun to every column and keep the resulting DataFrame.
df3 = df.apply(fun)
另一种解决方案(不使用 apply,直接对整个 DataFrame 做向量化运算)。需要注意:由于 mad=(df-median).median() 没有取绝对值,得到的 mad 接近 0,因此 3.1483*mad 和 -3.1483*mad 也接近 0,上下界都约等于中位数,最终两列中的所有值都被替换为各自的中位数 df.median()。输出如下:
# Fix the NumPy RNG seed so the sample printed below is reproducible.
np.random.seed(786)
# 10 rows x 2 columns of standard-normal samples.
df = pd.DataFrame(np.random.randn(10,2))
print (df)
0 1
0 -0.799307 1.065501
1 -0.246459 0.856806
2 -1.505766 -1.071535
3 -0.027080 0.707278
4 -1.946180 0.653074
5 0.093481 -1.825020
6 1.990691 0.811006
7 2.367850 -0.388028
8 -2.174134 -1.161844
9 -0.279455 0.082329
# Per-column median; the result is a Series indexed by column label.
median=df.median()
print (median)
0 -0.262957
1 0.367702
dtype: float64
# Median of the *signed* deviations from the column median. No absolute value
# is taken, so this is ~0 up to floating-point error (see the printed values)
# rather than a true median absolute deviation.
mad=(df-median).median()
print (mad)
0 1.387779e-17
1 -2.775558e-17
dtype: float64
# Element-wise comparisons against the per-column bounds; axis=1 aligns the
# bound Series with the DataFrame's column labels.
# m1: values above the upper bound.
m1 = df.gt((median+3.1483*mad), axis=1)
# m2: values below the lower bound.
m2 = df.lt((median-3.1483*mad), axis=1)
print (m1)
0 1
0 False True
1 True True
2 False False
3 True True
4 False True
5 True False
6 True True
7 True False
8 False False
9 False False
# Show the lower-bound mask.
print (m2)
0 1
0 True False
1 False False
2 True True
3 False False
4 True False
5 False True
6 False False
7 False True
8 True True
9 True True
# Upper and lower replacement values per column. Because mad is ~0 here,
# both are effectively equal to the median.
s1 = median+3.1483*mad # upper bound, effectively s1 = median
s2 = median-3.1483*mad # lower bound, effectively s2 = median
print (s2)
0 -0.262957
1 0.367702
dtype: float64
# Show the upper bound for comparison with s2.
print (s1)
0 -0.262957
1 0.367702
dtype: float64
# Replace the values flagged by m1 with the per-column upper bound, then the
# values flagged by m2 with the lower bound; axis=1 aligns s1/s2 with the
# column labels.
df3 = df.mask(m1, s1, axis=1).mask(m2, s2, axis=1)
print (df3)
0 1
0 -0.262957 0.367702
1 -0.262957 0.367702
2 -0.262957 0.367702
3 -0.262957 0.367702
4 -0.262957 0.367702
5 -0.262957 0.367702
6 -0.262957 0.367702
7 -0.262957 0.367702
8 -0.262957 0.367702
9 -0.262957 0.367702