我写了一个函数,先用 get_dummies 对属性列做哑变量(dummy)编码,再对结果进行规范化。如果某一行中 1 的个数多于 0 的个数,我希望把该行的 0 变为 1、1 变为 0:
def dummy_data(data, columns):
    """One-hot encode `columns`, then try to invert rows that hold more 1s than 0s.

    This is the question's original attempt, kept as posted. Each column in
    `columns` is replaced by its `pd.get_dummies` indicator columns; afterwards
    every row whose count of 1s exceeds its count of 0s is meant to have its
    0s and 1s swapped. Returns the transformed frame.
    """
    for column in columns:
        # Append the indicator columns for this attribute, then drop the source column.
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    # Per-row number of cells equal to 0 / equal to 1, taken over every column of `data`.
    n_zero = (data == 0).astype(int).sum(axis=1)
    n_uno = (data == 1).astype(int).sum(axis=1)
    for i in range(len(n_zero)):
        # NOTE(review): n_uno[i] / n_zero[i] index the Series with the integer i while
        # data.iloc[i] is positional -- presumably this relies on a default 0..n-1 index.
        if n_uno[i] > n_zero[i]:
            #replace_values = {0: 1, 1: 0}
            #data.iloc[i] = data.iloc[i].replace({data.iloc[i]: replace_values})
            # NOTE(review): Series.map with a dict maps every value that is not a key
            # to NaN, so the first call keeps only the former 0s (now 1) and the second
            # call then turns those 1s into 0 -- the two steps do not add up to a swap.
            data.iloc[i] = data.iloc[i].map({0 : 1})
            data.iloc[i] = data.iloc[i].map({1 : 0})
    return data
# Placeholder call from the question: the "..." stands for the elided attribute
# names, so this list literal is illustrative rather than runnable as written.
dummy_columns = ["ATTRIBUTE1",..."ATTRIBIUTE N"]
df=dummy_data(df, dummy_columns)
但这个函数并没有把我数据中的 0 和 1 互换。
答案 0 :(得分:1)
我认为你需要:
def dummy_data(data, columns):
    """One-hot encode `columns` and invert rows that contain more 1s than 0s.

    Args:
        data: Input DataFrame; it is not modified.
        columns: List of column labels to encode. Every listed column is
            encoded, whatever its dtype.

    Returns:
        A new DataFrame holding the remaining columns of `data` followed by
        the integer (0/1) indicator columns. In each row where the indicator
        columns contain more 1s than 0s, the indicators are swapped (0 <-> 1).
        Columns not listed in `columns` are never counted or altered.
    """
    # get_dummies with all columns together. dtype=int keeps the indicators as
    # 0/1 integers: newer pandas versions default to bool, and integer-keyed
    # replacement does not touch bool columns, so nothing would be flipped.
    dummies = pd.get_dummies(data[columns], columns=columns, dtype=int)
    # Count only over the indicator columns so unrelated numeric columns that
    # happen to contain 0 or 1 influence neither the decision nor the result.
    n_uno = dummies.sum(axis=1)
    n_zero = dummies.shape[1] - n_uno
    # replace by condition without loop
    m = n_uno > n_zero
    dummies = dummies.mask(m, 1 - dummies)
    return pd.concat([data.drop(columns, axis=1), dummies], axis=1)
示例:
# Small all-string sample frame with four columns (A-D), printed for reference.
df = pd.DataFrame({'A':list('abb'),
'B':list('bbb'),
'C':list('baa'),
'D':list('aaa')})
print (df)
A B C D
0 a b b a
1 b b a a
2 b b a a
def dummy_data(data, columns):
    """Walk-through variant: encode `columns`, print the intermediates, flip rows.

    Args:
        data: Input DataFrame; it is not modified.
        columns: List of column labels to one-hot encode.

    Returns:
        A new DataFrame holding the remaining columns of `data` followed by
        the integer (0/1) indicator columns, with the indicators swapped
        (0 <-> 1) in every row that contains more 1s than 0s.

    The commented tables below are the output printed for the sample frame.
    """
    rest = data.drop(columns, axis=1)
    # dtype=int keeps the indicators as 0/1 integers (newer pandas versions
    # default to bool, which integer-keyed replacement would leave untouched).
    dummies = pd.get_dummies(data[columns], columns=columns, dtype=int)
    print (pd.concat([rest, dummies], axis=1))
    #    D  A_a  A_b  B_b  C_a  C_b
    # 0  a    1    0    1    0    1
    # 1  a    0    1    1    1    0
    # 2  a    0    1    1    1    0

    # Count only over the indicator columns so the pass-through columns can
    # neither influence the decision nor be rewritten.
    n_uno = dummies.sum(axis=1)
    n_zero = dummies.shape[1] - n_uno
    m = n_uno > n_zero
    print (m)
    # 0    True
    # 1    True
    # 2    True
    # dtype: bool

    dummies = dummies.mask(m, 1 - dummies)
    return pd.concat([rest, dummies], axis=1)
# Encode A, B and C of the sample frame; D is not listed, so it is carried through.
dummy_columns = ['A','B', 'C']
df = dummy_data(df, dummy_columns)
print (df)
D A_a A_b B_b C_a C_b
0 a 0 1 0 1 0
1 a 1 0 0 0 1
2 a 1 0 0 0 1
答案 1 :(得分:0)
使用numpy logical_not:
# A fast way to find and invert the 1s and 0s:
def dummy_data(data_df, dummy_columns):
    """One-hot encode `dummy_columns` and invert rows with more 1s than 0s.

    Args:
        data_df: Input DataFrame; it is not modified.
        dummy_columns: List of column labels to one-hot encode.

    Returns:
        A new DataFrame with the untouched columns of `data_df` (in their
        original order) followed by the integer (0/1) indicator columns. The
        indicators of every row holding more 1s than 0s are logically negated.
    """
    # Drop by label rather than via a set difference so the remaining columns
    # keep their original order.
    static_df = data_df.drop(dummy_columns, axis=1)
    df = pd.get_dummies(data_df[dummy_columns], columns=dummy_columns)
    # astype always copies, giving a private, writable integer array no matter
    # which dtype get_dummies produced.
    vals = df.values.astype(int)
    ones_count = np.add.reduce(vals, axis=1)
    zeros_count = np.add.reduce(np.logical_not(vals), axis=1)
    idx = np.where(ones_count > zeros_count)[0]
    # The boolean result is cast back to 0/1 on assignment into the int array.
    vals[idx, :] = np.logical_not(vals[idx, :])
    result_df = pd.concat([static_df, pd.DataFrame(vals, index=df.index, columns=df.columns)], axis=1)
    return result_df