我正在做一个信用卡潜在客户识别的案例研究,需要把所有列的值替换为各自对应的 WOE(证据权重,Weight of Evidence)值。我可以分 2–3 步完成,但想知道是否有办法一步到位。
答案 0 :(得分:1)
您可能需要查看woe package(如果WOE代表证据权重)。
以下是文档中的相关代码段:
# Run woe() on mtcars with `cyl` as the independent variable (flagged as
# non-continuous) and `am` as the dependent variable.
library(woe)
res_woe <- woe(
  Data = mtcars,
  Independent = "cyl",
  Continuous = FALSE,
  Dependent = "am",
  C_Bin = 10,
  Bad = 0,
  Good = 1
)
答案 1 :(得分:1)
使用 scorecard 包中的 woebin()、woebin_plot()、woebin_ply() 和 iv()
函数即可,非常简单。
# WOE binning workflow with the scorecard package: build bins, plot one of
# them, apply the bins to the data, and compute information values.
library(scorecard)

temp <- credit_data

# Build the bins for every column against the target column.
bins <- woebin(dt = temp, y = "targetvariable")
woebin_plot(bins$Income)

# Apply the bins to the data and inspect the result.
WOE_temp <- woebin_ply(temp, bins)
View(WOE_temp)

# Inspect the rows where No.of.dependents is missing.
View(temp[is.na(temp$No.of.dependents), ])

# The target column name must be the same one passed to woebin() above.
IV_values <- iv(dt = temp, y = "targetvariable")
(IV_values)
答案 2 :(得分:0)
嗨,请按照以下步骤操作:-
第1步:使用 Information 包计算 WOE 和 IV:
library(fuzzyjoin)
library(Information)

# Build the information tables for test_df against the label column.
# The result's `Tables` element holds one table per variable with a `WOE`
# column, which woe_replace() reads.
IV <- Information::create_infotables(
  data = test_df,
  y = "label_column",
  parallel = TRUE
)
参数 "y" 用来指定标签(目标)列的列名,参数 "data" 用来指定数据框。
第2步:使用下面的函数。这是我自己编写的自定义函数,用 Information 包计算出的 WOE 值替换数据框中的实际值:
#' Find the WOE bin that contains each numeric value.
#'
#' @param values Numeric vector to look up.
#' @param bin_labels Character vector of bin labels written as
#'   "[lower,upper]". Labels that do not have this form (for example "NA")
#'   never match.
#' @return Integer vector the same length as `values`: the index of the
#'   first bin whose closed interval contains the value, or NA if none does.
.woe_bin_index <- function(values, bin_labels) {
  pattern <- "^\\[([^,]*),([^,]*)\\]$"
  bin_labels <- trimws(bin_labels)
  is_interval <- grepl(pattern, bin_labels)

  lower <- rep(NA_real_, length(bin_labels))
  upper <- rep(NA_real_, length(bin_labels))
  lower[is_interval] <- as.numeric(sub(pattern, "\\1", bin_labels[is_interval]))
  upper[is_interval] <- as.numeric(sub(pattern, "\\2", bin_labels[is_interval]))

  bin_idx <- rep(NA_integer_, length(values))
  # There are only a handful of bins, so loop over bins, not over values.
  for (b in which(!is.na(lower) & !is.na(upper))) {
    hit <- is.na(bin_idx) & !is.na(values) &
      values >= lower[b] & values <= upper[b]
    bin_idx[hit] <- b
  }
  bin_idx
}

#' Replace the values of a data frame with their WOE values.
#'
#' Every column of `df_orig` that has a table in `IV$Tables` is replaced by
#' the `WOE` value of the bin each value belongs to. Factor and character
#' columns are matched on the bin label itself; numeric and integer columns
#' are matched against closed "[lower,upper]" intervals parsed from the bin
#' labels. Columns of any other type, and columns without a table, are left
#' untouched.
#'
#' The lookup keeps inner-join semantics: a row whose value matches no bin
#' (including a missing value) is dropped, each replaced column is moved to
#' the end of the data frame, and row names are reset whenever a column is
#' replaced.
#'
#' Only base R is used, so an existing column named "WOE", "lv" or "uv"
#' cannot collide with the lookup, and multi-class columns such as ordered
#' factors are handled.
#'
#' @param df_orig Data frame whose values should be replaced.
#' @param IV List with a `Tables` element: a named list of data frames, one
#'   per variable, each with a column named after the variable (the bin
#'   labels) and a `WOE` column.
#' @return A data frame with the matched columns replaced by WOE values.
woe_replace <- function(df_orig, IV) {
  df <- as.data.frame(df_orig)
  woe_tables <- IV$Tables

  # Column names are fixed up front: replaced columns move to the end, so
  # iterating over the live data frame would be order-dependent.
  for (colmn_nm in intersect(colnames(df), names(woe_tables))) {
    # as.data.frame() keeps non-syntactic column names intact.
    woe_tbl <- as.data.frame(woe_tables[[colmn_nm]])
    bin_labels <- as.character(woe_tbl[[colmn_nm]])
    values <- df[[colmn_nm]]

    if (is.factor(values) || is.character(values)) {
      bin_idx <- match(as.character(values), bin_labels)
    } else if (is.numeric(values)) {
      bin_idx <- .woe_bin_index(values, bin_labels)
    } else {
      next
    }

    # Inner-join semantics: keep only the rows that found a bin.
    matched <- !is.na(bin_idx)
    df <- df[matched, , drop = FALSE]
    rownames(df) <- NULL

    # Drop and re-add so the replaced column ends up last.
    df[[colmn_nm]] <- NULL
    df[[colmn_nm]] <- woe_tbl[["WOE"]][bin_idx[matched]]
  }

  df
}
函数调用:-
# Replace the values of test_df using the previously built IV object.
test_df_woe <- woe_replace(df_orig = test_df, IV = IV)
或者一步到位:
# One-shot variant: build the information tables inline and pass them
# straight to woe_replace().
test_df_woe <- woe_replace(
  test_df,
  Information::create_infotables(
    data = test_df,
    y = "label_column",
    parallel = TRUE
  )
)