/*
 * A person record.  The constructors in this file (MK_PERSON, mk_person)
 * store malloc'd copies of their string arguments in `name` and `var`.
 */
struct Person {
    char *name;  /* NUL-terminated string */
    int numb;    /* integer supplied by whoever creates the record */
    char *var;   /* second NUL-terminated string; main() passes "m" here */
};
/*
 * MK_PERSON(per1, name, var, numb) - declare `per1` as a `struct Person *`
 * in the current scope and fill it with heap copies of the strings `name`
 * and `var` plus the integer `numb`.
 *
 * The formal parameters carry a trailing underscore on purpose.  The
 * preprocessor replaces a parameter wherever its name appears as a token in
 * the replacement list, including after `->`.  When the parameters were
 * called `name`, `var` and `numb`, a call such as
 * MK_PERSON(p, "Bob", "m", 12) expanded `per1->name` to `p->"Bob"` and did
 * not compile.
 *
 * If any allocation fails, everything allocated so far is released and
 * `per1` is left NULL, so callers should test it before use.
 *
 * Caveats: the string arguments are evaluated more than once, so they must
 * not have side effects.  The expansion is a declaration followed by a
 * statement, so it cannot be used as the unbraced body of an if/for/while.
 * Requires <stdlib.h> and <string.h>.  The caller owns the result and frees
 * per1->name, per1->var and per1.
 */
#define MK_PERSON(per1, name_, var_, numb_) \
struct Person *per1 = malloc(sizeof *per1); \
if (per1 != NULL) { \
    per1->numb = (numb_); \
    per1->name = malloc(strlen(name_) + 1); \
    per1->var = malloc(strlen(var_) + 1); \
    if (per1->name != NULL && per1->var != NULL) { \
        strcpy(per1->name, (name_)); \
        strcpy(per1->var, (var_)); \
    } else { \
        free(per1->name); \
        free(per1->var); \
        free(per1); \
        per1 = NULL; \
    } \
}
/*
 * mk_person - allocate a struct Person holding private copies of two strings.
 *
 * name, var: NUL-terminated strings to copy; a NULL pointer is rejected.
 * numb:      stored unchanged in the `numb` member.
 *
 * Returns the new object, or NULL if an argument is NULL or an allocation
 * fails; nothing is leaked on the failure paths.  The caller owns the result
 * and releases it by calling free() on ->name, ->var and the struct itself.
 */
struct Person * mk_person(const char *name, const char *var, int numb){
    if (name == NULL || var == NULL) {
        return NULL;
    }

    struct Person *per1 = malloc(sizeof *per1);
    if (per1 == NULL) {
        return NULL;
    }

    per1->numb = numb;
    per1->name = malloc(strlen(name) + 1);
    per1->var = malloc(strlen(var) + 1);
    if (per1->name == NULL || per1->var == NULL) {
        /* free(NULL) is a no-op, so whichever allocation succeeded is released. */
        free(per1->name);
        free(per1->var);
        free(per1);
        return NULL;
    }

    strcpy(per1->name, name);
    strcpy(per1->var, var);
    return per1;
}
int main(){
MK_PERSON(pers1, "Bob","m", 12);
struct Person *pers2 = mk_person("Mike", "m", 13);
}
如何在不影响“贷款ID”、“受抚养人”、“申请人收入”列的情况下,对“性别”、“已婚”、“教育”列进行编码?
答案 0 :(得分:2)
这应该可以解决您的问题。
from sklearn.preprocessing import LabelEncoder

# Replace each listed categorical column with integer codes.
le = LabelEncoder()
for cat_var in ['Gender', 'Married', 'Education']:
    # fit_transform refits `le` on every pass, so once the loop ends the
    # encoder only holds the classes of the last column ('Education').
    df[cat_var] = le.fit_transform(df[cat_var])
答案 1 :(得分:1)
我更喜欢使用 pd.get_dummies 方法,所以:
# One-hot encode the three categorical columns. pd.get_dummies returns a new
# frame, so `df` itself is left as it was.
ohe_df = pd.get_dummies(
    df,
    columns=['Gender', 'Married', 'Education'],
)
答案 2 :(得分:1)
在准备数据时,请考虑以下几点:
LoanID列是有序的分类数据,需要使用独热编码(one-hot encoding)将其转换为数字,因为算法只能理解数字
标签编码器非常适合二元类别;对于多类别,请尝试使用独热编码器或 factorize
为数值和转换后的分类数据创建单独的列,并在一个df中合并以进行训练和测试拆分
以您的问题为例:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Create ndarrays for label encoding (sklearn); each slice keeps one column as a 2-D array.
Gender = data.iloc[:, 1:2].values
Married = data.iloc[:, 2:3].values
# Was 4:3, an empty slice (start > stop) that selected no column at all.
# NOTE(review): 4:5 assumes 'Education' sits at column position 4 - confirm against your data.
Education = data.iloc[:, 4:5].values
## le for Gender
le = LabelEncoder()
Gender[:, 0] = le.fit_transform(Gender[:, 0])
Gender = pd.DataFrame(Gender)
Gender.columns = ['Gender']
# Map every original class label to the integer the encoder assigned to it.
le_Gender_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Sklearn label encoder results for Gender:")
print(le_Gender_mapping)
Do the same for 'Married' and 'Education', as they are also binary.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# The variable was spelled both `Load_ID` and `Loan_ID`, so the fit_transform
# and concat calls raised NameError; it is `Loan_ID` throughout now.
Loan_ID = data.iloc[:, 0:1].values  # ndarray
## ohe for Loan_ID
ohe = OneHotEncoder()
Loan_ID = ohe.fit_transform(Loan_ID).toarray()
Loan_ID = pd.DataFrame(Loan_ID)
print("Sklearn one hot encoder results for Loan_ID:")
## put data together
X_num = data[['Applicant_Income']].copy()
# Gender, Married and Education must already be DataFrames at this point.
X_final = pd.concat([Loan_ID, Gender, Married, Education, X_num], axis=1)
This prepares your initial data set. Take out the column you want to predict as y_final, then do the train/test split.
Note: after the train/test split, normalize or standardize the features (standardization is preferred, as it is less affected by outliers); otherwise Applicant_Income will dominate the predictions.
答案 3 :(得分:1)
您可以使用Label Encoder:
from sklearn import preprocessing

# One independent encoder per column, so each can later undo its own mapping.
le1, le2, le3 = (preprocessing.LabelEncoder() for _ in range(3))

df['Gender'] = le1.fit_transform(df['Gender'])
df['Married'] = le2.fit_transform(df['Married'])
df['Education'] = le3.fit_transform(df['Education'])
这种方法为每一列使用不同的标签编码器,这也意味着不同的列中会出现相同的编号。
如果对所有列只使用同一个标签编码器(一次性拟合),那么完全相同的单词会得到相同的数字。
分类后,您可以使用以下方法反转标签:
# Turn the integer codes back into the original labels, using the encoder
# that was fitted on this column (the closing parenthesis was missing).
df['Married'] = le2.inverse_transform(df['Married'])