我正试图规范我的数据[-1,1],并通过将它们四舍五入为-1,+ 1的端点来消除异常值,因为我不想删除它们并将它们最终保存在最终的dataFrame中所谓的df_norm
。
我的数据是txt文件,如下:
id_set: 000
A: -2.46882615679
B: -2.26408246559
C: -325.004619528
但是我在第一种方法和第二种方法中分别遇到以下错误:
ValueError
Can only tuple-index with a MultiIndex
File "D:\results\erfan - normG.py", line 210, in <module>
data_norm = {'Sx-Sy': Sx_Sy_norm[:,0], 'Sxy': Sxy_norm[:,0], 'Temperature': Temperature_norm[:,0]}
下一个:
AttributeError
'numpy.ndarray' object has no attribute 'iat'
File "D:\results\erfan - normG.py", line 71, in outlier_fix
if (data.iat[i] > _max):
File "D:\results\erfan - normG.py", line 191, in <module>
new_value11 = outlier_fix(new_value1 , -1 , 1)
我的完整脚本如下:
import numpy as np
import pandas as pd
def normalize(value, min_value, max_value, min_norm, max_norm):
new_value = ((max_norm - min_norm)*((value - min_value)/(max_value - min_value))) + min_norm
return new_value
def outlier_fix(data, _min, _max):
for i in range (0, data.size):
if (data.iat[i] > _max):
data.iat[i] = _max
if (data.iat[i] < _min):
data.iat[i] = _min
return data
dft = pd.read_csv('D:\mc25s.txt', header=None)
id_set = dft[dft.index % 4 == 0].astype('int').values
A = dft[dft.index % 4 == 1].values
B = dft[dft.index % 4 == 2].values
C = dft[dft.index % 4 == 3].values
data = {'A': A[:,0], 'B': B[:,0], 'C': C[:,0]}
df = pd.DataFrame(data, columns=['A','B','C'], index = id_set[:,0])
#--------------------1st approach----------------------
for i in df:
if i == 'A':
min_val = df[i].min()
max_val = df[i].max()
new_value1 = normalize(df['A'], min_val, max_val, -1, 1)
new_value11 = outlier_fix(new_value1 , -1 , 1)
A_norm = new_value11
if i == 'B':
min_val = df[i].min()
max_val = df[i].max()
new_value2 = normalize(df['B'], min_val, max_val, -1, 1)
new_value22 = outlier_fix(new_value2 , -1 , 1)
B_norm = new_value22
if i == 'C':
min_val = df[i].min()
max_val = df[i].max()
new_value3 = normalize(df['C'], min_val, max_val, -1, 1)
new_value33 = outlier_fix(new_value3 , -1 , 1)
C_norm = new_value33
data_norm = {'A': A_norm[:,0], 'B': B_norm[:,0], 'C': C_norm[:,0]}
df_norm = pd.DataFrame(data_norm, columns=['A','B','C'], index=None)
df_norm.to_csv('m25_norm.csv', na_rep='nan', encoding='utf-8', index=False)
#-----------------------2nd approach----------------------
for i in df:
if i == 'A':
min_val = df[i].min()
max_val = df[i].max()
new_value1 = normalize(df[i].values, min_val, max_val, -1, 1)
new_value11 = outlier_fix(new_value1 , -1 , 1)
A_norm = new_value11
if i == 'B':
min_val = df[i].min()
max_val = df[i].max()
new_value2 = normalize(df[i].values, min_val, max_val, -1, 1)
new_value22 = outlier_fix(new_value2 , -1 , 1)
B_norm = new_value22
if i == 'C':
min_val = df[i].min()
max_val = df[i].max()
new_value3 = normalize(df[i].values, min_val, max_val, -1, 1)
new_value33 = outlier_fix(new_value3 , -1 , 1)
C_norm = new_value33
df_norm2 = pd.DataFrame(df , index = id_set[:,0])
df_norm2.to_csv('mc25s_norm.csv', na_rep='nan', encoding='utf-8', index=False)
任何帮助都将受到欢迎。