在下面的函数中,我正在使用Pandas数据框。我要引入一个数据框,并立即重置索引。然后,我对该数据帧进行复制,以免出现任何链式分配问题。
然后,我想使用.dropna(inplace=True, subset = [header], axis=0)
删除我感兴趣的列(header
)为nan
的任何行。但是,一旦我进入for循环,很明显nan
值并没有下降,因为我不断收到如下警告:
RuntimeWarning:空切片的均值
这是我的数组neighbors
具有所有nan
值的结果。
我的问题:在我使用df_copy.dropna(inplace=True, subset=[header], axis=0)
的那一行中,我似乎并没有永久丢掉那些行?
n_samples = 10
tolerance = 1.5
dataframe = pd.read_csv('my_file.csv')
def removeOutliers(dataframe, header):
dataframe.reset_index(inplace=True, drop=True)
df_copy = dataframe.copy()
#Why doesn't the below actually drop the NaNs?
df_copy.dropna(inplace=True, subset=[header], axis=0)
for ii in range(len(df_copy['Lng'])):
a = df_copy.iloc[ii]['Lng'] - df_copy.iloc[:]['Lng']
b = df_copy.iloc[ii]['Lat'] - df_copy.iloc[:]['Lat']
c = np.array((a**2 + b**2)**0.5 )
d = np.zeros((len(df_copy['Lng'])))
e = np.zeros((len(df_copy['Lng'])))
d[:] = df_copy.iloc[:]['Well']
e[:] = df_copy.iloc[:][header]
idx = np.argpartition(c, n_samples+1)
max_loc = np.where(e[idx[0:n_samples+1]] == e[ii])
neighbors = np.delete(e[idx[0:n_samples+1]], max_loc)
avg = np.nanmean(neighbors)
std = np.nanstd(neighbors)
if df_copy.iloc[ii][header] > (avg + tolerance*std) or df_copy.iloc[ii][header] < (avg - tolerance*std):
df_copy.iloc[ii, df_copy.columns.get_loc(header)] = np.nan
return df_copy
test_data = removeOutliers(dataframe, 'myColumn')