我正在努力确定如何生成表格输出,如下所示。 (我希望能够利用条件逻辑来遮蔽单元格,如Excel中生成的附加输出所示,但我很高兴只是理解如何在没有启动器阴影的情况下生成输出。)
整体流程:
加载所需的包。
library(gains)
library(plyr)
library(StatMeasures)
library(sqldf)
library(tables)
full_dataset <- MineThatData
reduced_dataset <- full_dataset[ ,
c("conversion","spend","train","logistic.score")]
reduced_dataset <- rename(reduced_dataset,c("logistic.score"="score"))
reduced_dataset$score <- round(reduced_dataset$score, 8)
summary(reduced_dataset)
trainDF <- reduced_dataset[reduced_dataset$train == 1, ]
testDF <- reduced_dataset[reduced_dataset$train == 0, ]
trainDF$Decile <- decile(trainDF$score, decreasing = TRUE)
summarize_results_by_decile <- function(Input_DF, Output_DF) {
Output_DF <- sqldf("
select
case when train = 1 then 'Train' else 'Test' end as Sample
,Decile
,count(*) as Num_Records
,sum(conversion) as Num_Converters
,sum(spend) as Sum_Spend
,min(score) as Min_Score
,max(score) as Max_Score
,round(avg(conversion),4) as Pct_Response
,round(avg(spend),2) as Spend_per_Record
from Input_DF
group by Decile
order by Decile
")
temp_df <- sqldf("
select
case when train = 1 then 'Train' else 'Test' end as Sample
,11 as Decile
,count(*) as Num_Records
,sum(conversion) as Num_Converters
,sum(Spend) as Sum_Spend
,min(score) as Min_Score
,max(score) as Max_Score
,round(avg(conversion),4) as Pct_Response
,round(avg(spend),2) as Spend_per_Record
from Input_DF
")
Output_DF <- rbind(Output_DF , temp_df)
Output_DF$Decile <- factor(Output_DF$Decile,
labels =c("1","2","3","4","5","6","7","8","9","10","Total"))
Output_DF$Pct_of_Records <- paste(format(round(Output_DF$Num_Records
/ temp_df$Num_Records * 100, 1), nsmall=1), "%", sep="")
Output_DF$Pct_of_Converters <- paste(format(round(Output_DF$Num_Converters
/ temp_df$Num_Converters * 100, 1), nsmall=1), "%", sep="")
Output_DF$Pct_of_Spend <- paste(format(round(Output_DF$Sum_Spend
/ temp_df$Sum_Spend * 100, 1), nsmall=1), '%', sep="")
Output_DF$Num_Records <- format(Output_DF$Num_Records, big.mark = ",")
Output_DF$Num_Converters <- format(Output_DF$Num_Converters,
big.mark = ",")
Output_DF$Sum_Spend <- paste("$" , sep="", format(Output_DF$Sum_Spend,
big.mark = ","))
Output_DF$Pct_Response <- paste(format(round(Output_DF$Pct_Response * 100,
2), nsmall=2), "%", sep="")
Output_DF$Spend_per_Record <- paste("$", sep="",
format(Output_DF$Spend_per_Record, nsmall=2))
return(Output_DF)
}
summary_results_train <- summarize_results_by_decile(trainDF,
summary_results_train)
Min_Decile_Scores <- t(subset(summary_results_train, select = Min_Score))
assign_decile <- function(score_var, decile_var) {
decile_var <- ifelse(score_var >= Min_Decile_Scores[1], 1,
ifelse(score_var >= Min_Decile_Scores[2], 2,
ifelse(score_var >= Min_Decile_Scores[3], 3,
ifelse(score_var >= Min_Decile_Scores[4], 4,
ifelse(score_var >= Min_Decile_Scores[5], 5,
ifelse(score_var >= Min_Decile_Scores[6], 6,
ifelse(score_var >= Min_Decile_Scores[7], 7,
ifelse(score_var >= Min_Decile_Scores[8], 8,
ifelse(score_var >= Min_Decile_Scores[9], 9, 10)))))))))
return(decile_var)
}
验证用于在训练数据上分配十分位分配的逻辑:
trainDF$Replicate_Decile <- assign_decile(trainDF$score,
trainDF$Replicate_Decile)
table(trainDF$Decile, trainDF$Replicate_Decile)
trainDF$Replicate_Decile <- NULL
testDF$Decile <- assign_decile(testDF$score, testDF$Decile)
summary_results_test <- summarize_results_by_decile(testDF,
summary_results_test)
summary_results <- rbind(summary_results_train, summary_results_test)
summary_results <- subset(summary_results, select = -c(Min_Score,Max_Score))
这样做是为了重新排序要显示的列:
summary_results <- summary_results[ ,c("Sample", "Decile", "Num_Records",
"Num_Converters", "Sum_Spend", "Pct_of_Records",
"Pct_of_Converters","Pct_of_Spend","Pct_Response",
"Spend_per_Record")]
这样做会影响列名的显示方式:
summary_results <- rename(summary_results,
c("Num_Records" = "# Records",
"Num_Converters" = "# Converters",
"Sum_Spend" = "Total Spend",
"Pct_of_Records" = "% of Records",
"Pct_of_Converters" = "% of Converters",
"Pct_of_Spend" = "% of Spend",
"Pct_Response" = "% Conversion",
"Spend_per_Record" = "$ per Record"))
print(summary_results[summary_results$Sample == 'Train', -1], row.names = FALSE)
print(summary_results[summary_results$Sample == 'Test' , -1], row.names = FALSE)
这是我第一次发布Stack Overflow,我是一个相对较新的R用户。我希望我的代码是可以理解的!提前感谢您的任何帮助。
答案 0 :(得分:1)
不太好,也没有解决方案:
library(gains)
library(expss)
full_dataset = MineThatData
reduced_dataset = full_dataset[ ,
c("conversion","spend","train","logistic.score")]
reduced_dataset$score = round(reduced_dataset$logistic.score, 8)
summary_fun = function(data){
calc(data,
list(
"# Records" = NROW(data),
"# Converters" = sum(conversion),
"$ Spend" = sum(spend),
"% Conversion" = round(mean(conversion),4)*100,
"$ per Record" = round(mean(spend),2)
)
)
}
reduced_dataset %>%
compute({
decile_points = quantile(score[train==1],
probs = seq(0,1,by = 0.1)
)
decile_points[length(decile_points)] = Inf
# '11 - ' is needed make reverse order
decile = 11 - as.integer(cut(score, decile_points, include.lowest = TRUE))
rm(decile_points) # we don't need this in our dataset
}) %>%
# "|" to suppress variable label
tab_rows("|" = decile, total(label = "Total")) %>%
tab_cols(total(label = "|")) %>%
tab_cells(sheet(conversion,
spend)) %>%
tab_subgroup(train==1) %>%
tab_stat_fun_df(summary_fun, label = "Train") %>%
tab_subgroup(train==0) %>%
tab_stat_fun_df(summary_fun, label = "Test") %>%
tab_pivot(stat_position = "inside_columns") %>%
# calculate percent
do_repeat(i = perl("Records|Converters|Spend"), {
..[gsub("#|\\$", "% of", .item_value, perl = TRUE)] = i/i[.N]*100
}) %>%
# move some columns to the end
keep(!fixed("% Conversion") & !fixed("$ per Record"), other()) %>%
# formating
do_repeat(i = fixed("#"), {
i = format(i, big.mark = ",")
}) %>%
do_repeat(i = fixed("$"), {
i = paste0("$", format(i, big.mark = ","))
}) %>%
do_repeat(i = fixed("%"), {
i = paste0(format(round(i, 1), nsmall=1), "%")
}) %>%
htmlTable()
免责声明:我是'expss'套餐的作者。