我使用标签编码器将分类数据转换为数值。
LabelEncoder如何处理缺失值?
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
le = LabelEncoder()
le.fit_transform(a)
输出:
array([1, 2, 3, 0, 4, 1])
对于上面的示例,标签编码器将NaN值更改为类别。我怎么知道哪个类别代表缺失值?
答案 0 :(得分:10)
不要将LabelEncoder
与缺失值一起使用。我不知道您使用的是scikit-learn
的哪个版本,但在0.17.1中,您的代码会引发TypeError: unorderable types: str() > float()
。
正如您所看到的in the source,它对要编码的数据使用numpy.unique
,如果找到缺失值则会引发TypeError
。如果要编码缺失值,请先将其类型更改为字符串:
a[pd.isnull(a)] = 'NaN'
答案 1 :(得分:4)
你好,我为自己的工作做了一些计算黑客攻击:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
le = LabelEncoder()
### fit with the desired col, col in position 0 for this example
fit_by = pd.Series([i for i in a.iloc[:,0].unique() if type(i) == str])
le.fit(fit_by)
### Set transformed col leaving np.NaN as they are
a["transformed"] = fit_by.apply(lambda x: le.transform([x])[0] if type(x) == str else x)
答案 2 :(得分:2)
您可以通过某些值填充na,然后将数据框列类型更改为字符串以使其正常工作。
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
a.fillna(99)
le = LabelEncoder()
le.fit_transform(a.astype(str))
答案 3 :(得分:2)
这是我的解决方案,因为我对此处发布的解决方案不满意。我需要一个LabelEncoder来将丢失的值保留为“ NaN”,以便以后使用Imputer。因此,我编写了自己的LabelEncoder类。它适用于DataFrames。
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
class LabelEncoderByCol(BaseEstimator, TransformerMixin):
def __init__(self,col):
#List of column names in the DataFrame that should be encoded
self.col = col
#Dictionary storing a LabelEncoder for each column
self.le_dic = {}
for el in self.col:
self.le_dic[el] = LabelEncoder()
def fit(self,x,y=None):
#Fill missing values with the string 'NaN'
x[self.col] = x[self.col].fillna('NaN')
for el in self.col:
#Only use the values that are not 'NaN' to fit the Encoder
a = x[el][x[el]!='NaN']
self.le_dic[el].fit(a)
return self
def transform(self,x,y=None):
#Fill missing values with the string 'NaN'
x[self.col] = x[self.col].fillna('NaN')
for el in self.col:
#Only use the values that are not 'NaN' to fit the Encoder
a = x[el][x[el]!='NaN']
#Store an ndarray of the current column
b = x[el].get_values()
#Replace the elements in the ndarray that are not 'NaN'
#using the transformer
b[b!='NaN'] = self.le_dic[el].transform(a)
#Overwrite the column in the DataFrame
x[el]=b
#return the transformed DataFrame
return x
您不仅可以输入1维系列,还可以输入DataFrame。使用col您可以选择应编码的列。
我想在这里提供一些反馈。
答案 4 :(得分:1)
you can also use a mask to replace form the original data frame after labelling
df = pd.DataFrame({'A': ['x', np.NaN, 'z'], 'B': [1, 6, 9], 'C': [2, 1, np.NaN]})
A B C
0 x 1 2.0
1 NaN 6 1.0
2 z 9 NaN
dfTmp = df
mask = df_1.isnull()
A B C
0 False False False
1 True False False
2 False False True
df = df.astype(str).apply(LabelEncoder().fit_transform)
df.where(~mask, original)
A B C
0 1.0 0 1.0
1 NaN 1 0.0
2 2.0 2 NaN
答案 5 :(得分:1)
该模块在my Github中也可用示例进行组织。
如果您喜欢我的解决方案,请投票。
Tks, 伊丹
label_encoder_contain_missing_values类:
def __init__ (self) :
pass
def categorical_to_numeric (self,dataset):
import numpy as np
import pandas as pd
self.dataset = dataset
self.summary = None
self.table_encoder= {}
for index in self.dataset.columns :
if self.dataset[index].dtypes == 'object' :
column_data_frame = pd.Series(self.dataset[index],name='column').to_frame()
unique_values = pd.Series(self.dataset[index].unique())
i = 0
label_encoder = pd.DataFrame({'value_name':[],'Encode':[]})
while i <= len(unique_values)-1:
if unique_values.isnull()[i] == True :
label_encoder = label_encoder.append({'value_name': unique_values[i],'Encode':np.nan}, ignore_index=True) #np.nan = -1
else:
label_encoder = label_encoder.append({'value_name': unique_values[i],'Encode':i}, ignore_index=True)
i+=1
output = pd.merge(left=column_data_frame,right = label_encoder, how='left',left_on='column',right_on='value_name')
self.summary = output[['column','Encode']].drop_duplicates().reset_index(drop=True)
self.dataset[index] = output.Encode
self.table_encoder.update({index:self.summary})
else :
pass
# ---- Show Encode Table ----- #
print('''\nLabel Encoding completed in Successfully.\n
Next steps: \n
1. To view table_encoder, Execute the follow: \n
for index in table_encoder :
print(f'\\n{index} \\n',table_encoder[index])
2. For inverse, execute the follow : \n
df = label_encoder_contain_missing_values().
inverse_numeric_to_categorical(table_encoder, df) ''')
return self.table_encoder ,self.dataset
def inverse_numeric_to_categorical (self,table_encoder, df):
dataset = df.copy()
for column in table_encoder.keys():
df_column = df[column].to_frame()
output = pd.merge(left=df_column,right = table_encoder[column], how='left',left_on= column,right_on='Encode')#.rename(columns={'column_x' :'encode','column_y':'category'})
df[column]= output.column
print('\nInverse Label Encoding, from categorical to numerical completed in Successfully.\n')
return df
从分类到数字执行命令
table_encoder,df = label_encoder_contain_missing_values()。categorical_to_numeric(df)
从数字到分类执行命令
df = label_encoder_contain_missing_values()。inverse_numeric_to_categorical(table_encoder,df)
答案 6 :(得分:0)
我遇到了同样的问题,但上述都没有对我有用。所以我在训练数据中添加了一个新行,仅包含“nan”
答案 7 :(得分:0)
以下编码器地址每个类别中没有值。
class MultiColumnLabelEncoder:
def __init__(self):
self.columns = None
self.led = defaultdict(preprocessing.LabelEncoder)
def fit(self, X):
self.columns = X.columns
for col in self.columns:
cat = X[col].unique()
cat = [x if x is not None else "None" for x in cat]
self.led[col].fit(cat)
return self
def fit_transform(self, X):
if self.columns is None:
self.fit(X)
return self.transform(X)
def transform(self, X):
return X.apply(lambda x: self.led[x.name].transform(x.apply(lambda e: e if e is not None else "None")))
def inverse_transform(self, X):
return X.apply(lambda x: self.led[x.name].inverse_transform(x))
使用示例
df = pd.DataFrame({
'pets': ['cat', 'dog', 'cat', 'monkey', 'dog', 'dog'],
'owner': ['Champ', 'Ron', 'Brick', None, 'Veronica', 'Ron'],
'location': ['San_Diego', 'New_York', 'New_York', 'San_Diego', 'San_Diego',
None]
})
print(df)
location owner pets
0 San_Diego Champ cat
1 New_York Ron dog
2 New_York Brick cat
3 San_Diego None monkey
4 San_Diego Veronica dog
5 None Ron dog
le = MultiColumnLabelEncoder()
le.fit(df)
transformed = le.transform(df)
print(transformed)
location owner pets
0 2 1 0
1 0 3 1
2 0 0 0
3 2 2 2
4 2 4 1
5 1 3 1
inverted = le.inverse_transform(transformed)
print(inverted)
location owner pets
0 San_Diego Champ cat
1 New_York Ron dog
2 New_York Brick cat
3 San_Diego None monkey
4 San_Diego Veronica dog
5 None Ron dog
答案 8 :(得分:0)
这是我的方法:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
UNKNOWN_TOKEN = '<unknown>'
a = pd.Series(['A','B','C', 'D','A'], dtype=str).unique().tolist()
a.append(UNKNOWN_TOKEN)
le = LabelEncoder()
le.fit_transform(a)
embedding_map = dict(zip(le.classes_, le.transform(le.classes_)))
以及应用于新的测试数据时:
test_df = test_df.apply(lambda x: x if x in embedding_map else UNKNOWN_TOKEN)
le.transform(test_df)
答案 9 :(得分:0)
这是一个简单的方法
这是Titanic
的示例LABEL_COL = ["Sex", "Embarked"]
def label(df):
_df = df.copy()
le = LabelEncoder()
for col in LABEL_COL:
# Not NaN index
idx = ~_df[col].isna()
_df.loc[idx, col] \
= le.fit(_df.loc[idx, col]).transform(_df.loc[idx, col])
return _df
答案 10 :(得分:0)
我也想贡献我的解决方法,因为在处理包含缺失值的分类数据时,我发现其他人更加繁琐
# Create a random dataframe
foo = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))
# Randomly intersperse column 'A' with missing data (NaN)
foo['A'][np.random.randint(0,len(foo), size=20)] = np.nan
# Convert this series to string, to simulate our problem
series = foo['A'].astype(str)
# np.nan are converted to the string "nan", mask these out
mask = (series == "nan")
# Apply the LabelEncoder to the unmasked series, replace the masked series with np.nan
series[~mask] = LabelEncoder().fit_transform(series[~mask])
series[mask] = np.nan
foo['A'] = series
答案 11 :(得分:0)
@Kerem投票最多的答案有错别字,因此,我在此处发布了经过更正和改进的答案:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
a = pd.DataFrame(['A','B','C',np.nan,'D','A'])
for j in a.columns.values:
le = LabelEncoder()
### fit with the desired col, col in position 0 for this ###example
fit_by = pd.Series([i for i in a[j].unique() if type(i) == str])
le.fit(fit_by)
### Set transformed col leaving np.NaN as they are
a["transformed"] = a[j].apply(lambda x: le.transform([x])[0] if type(x) == str else x)
答案 12 :(得分:0)
您可以通过将缺失值替换为字符串“ NaN”来处理缺失值。可以通过le.transfrom()获得该类别。
le.fit_transform(a.fillna('NaN'))
category = le.transform(['NaN'])
另一种解决方案是让标签编码器忽略缺失值。
a = le.fit_transform(a.astype(str))
答案 13 :(得分:0)
此函数从数据框中获取一列,并返回仅对非NaN进行标签编码的列,其余保持不变
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def label_encode_column(col):
nans = col.isnull()
nan_lst = []
nan_idx_lst = []
label_lst = []
label_idx_lst = []
for idx, nan in enumerate(nans):
if nan:
nan_lst.append(col[idx])
nan_idx_lst.append(idx)
else:
label_lst.append(col[idx])
label_idx_lst.append(idx)
nan_df = pd.DataFrame(nan_lst, index=nan_idx_lst)
label_df = pd.DataFrame(label_lst, index=label_idx_lst)
label_encoder = LabelEncoder()
label_df = label_encoder.fit_transform(label_df.astype(str))
label_df = pd.DataFrame(label_df, index=label_idx_lst)
final_col = pd.concat([label_df, nan_df])
return final_col.sort_index()