R

时间:2018-02-02 00:20:45

标签: r

我正在努力确定如何生成表格输出,如下所示。 (我希望能够利用条件逻辑来遮蔽单元格,如Excel中生成的附加输出所示,但我很高兴只是理解如何在没有启动器阴影的情况下生成输出。)

所需输出(在Excel中生成) image description

整体流程:

  1. 使用可用于"增益"的MineThatData数据集。包。
  2. 完整数据集包括四个模型分数。只需保留" logistic.score"用于说明。
  3. 根据模型分数(即logistic.score)将Training示例中的每条记录(train = 1)分配到十分位数。
  4. 使用Training示例中的得分范围将Test样本中的记录分配给十分位
  5. 以每个采样组的十分位数报告各种统计数据(包括每个记录的转换率和支出)。
  6. 加载所需的包。

    library(gains)
    library(plyr)
    library(StatMeasures)
    library(sqldf)
    library(tables)
    
    full_dataset <- MineThatData
    
    reduced_dataset <- full_dataset[ , 
                               c("conversion","spend","train","logistic.score")]
    
    reduced_dataset <- rename(reduced_dataset,c("logistic.score"="score"))
    
    reduced_dataset$score <- round(reduced_dataset$score, 8)
    
    summary(reduced_dataset)
    
    trainDF <- reduced_dataset[reduced_dataset$train == 1, ]
    testDF  <- reduced_dataset[reduced_dataset$train == 0, ]
    
    trainDF$Decile <- decile(trainDF$score, decreasing = TRUE)
    
    summarize_results_by_decile <- function(Input_DF, Output_DF) {
      Output_DF <- sqldf("
         select
          case when train = 1 then 'Train' else 'Test' end as Sample
         ,Decile
         ,count(*) as Num_Records
         ,sum(conversion) as Num_Converters
         ,sum(spend) as Sum_Spend
         ,min(score) as Min_Score
         ,max(score) as Max_Score
         ,round(avg(conversion),4) as Pct_Response
         ,round(avg(spend),2) as Spend_per_Record
         from Input_DF
         group by Decile
         order by Decile
         ")
    
      temp_df <- sqldf("
         select
          case when train = 1 then 'Train' else 'Test' end as Sample
         ,11 as Decile
         ,count(*) as Num_Records
         ,sum(conversion) as Num_Converters
         ,sum(Spend) as Sum_Spend
         ,min(score) as Min_Score
         ,max(score) as Max_Score
         ,round(avg(conversion),4) as Pct_Response
         ,round(avg(spend),2) as Spend_per_Record
         from Input_DF
         ")
    
      Output_DF <- rbind(Output_DF , temp_df)
    
      Output_DF$Decile <- factor(Output_DF$Decile, 
                    labels =c("1","2","3","4","5","6","7","8","9","10","Total"))
    
      Output_DF$Pct_of_Records <- paste(format(round(Output_DF$Num_Records    
        / temp_df$Num_Records * 100, 1), nsmall=1), "%", sep="")
    
      Output_DF$Pct_of_Converters <- paste(format(round(Output_DF$Num_Converters 
        / temp_df$Num_Converters * 100, 1), nsmall=1), "%", sep="")
    
      Output_DF$Pct_of_Spend <- paste(format(round(Output_DF$Sum_Spend      
        / temp_df$Sum_Spend * 100, 1), nsmall=1), '%', sep="")
    
      Output_DF$Num_Records <- format(Output_DF$Num_Records, big.mark = ",")
    
      Output_DF$Num_Converters <- format(Output_DF$Num_Converters, 
                                         big.mark = ",")
    
      Output_DF$Sum_Spend <- paste("$" , sep="", format(Output_DF$Sum_Spend,      
                                                        big.mark = ","))
    
      Output_DF$Pct_Response <- paste(format(round(Output_DF$Pct_Response * 100, 
                                            2), nsmall=2), "%", sep="")
    
      Output_DF$Spend_per_Record <- paste("$", sep="", 
                                   format(Output_DF$Spend_per_Record, nsmall=2))
    
    return(Output_DF)
    }
    
    summary_results_train <- summarize_results_by_decile(trainDF, 
                                                         summary_results_train)
    
    Min_Decile_Scores <- t(subset(summary_results_train, select = Min_Score))
    
    assign_decile <- function(score_var, decile_var) {
      decile_var <- ifelse(score_var >= Min_Decile_Scores[1], 1,
                    ifelse(score_var >= Min_Decile_Scores[2], 2,
                    ifelse(score_var >= Min_Decile_Scores[3], 3,
                    ifelse(score_var >= Min_Decile_Scores[4], 4,
                    ifelse(score_var >= Min_Decile_Scores[5], 5,
                    ifelse(score_var >= Min_Decile_Scores[6], 6,
                    ifelse(score_var >= Min_Decile_Scores[7], 7,
                    ifelse(score_var >= Min_Decile_Scores[8], 8,
                    ifelse(score_var >= Min_Decile_Scores[9], 9, 10)))))))))
    
      return(decile_var)
    }
    

    验证用于在训练数据上分配十分位分配的逻辑:

    trainDF$Replicate_Decile <- assign_decile(trainDF$score, 
                                                       trainDF$Replicate_Decile)
    table(trainDF$Decile, trainDF$Replicate_Decile)
    trainDF$Replicate_Decile <- NULL
    
    testDF$Decile <- assign_decile(testDF$score, testDF$Decile)
    
    summary_results_test <- summarize_results_by_decile(testDF, 
                                                        summary_results_test)
    
    summary_results <- rbind(summary_results_train, summary_results_test)
    
    summary_results <- subset(summary_results, select = -c(Min_Score,Max_Score))
    

    这样做是为了重新排序要显示的列:

    summary_results <- summary_results[ ,c("Sample", "Decile", "Num_Records", 
                             "Num_Converters", "Sum_Spend", "Pct_of_Records", 
                             "Pct_of_Converters","Pct_of_Spend","Pct_Response", 
                             "Spend_per_Record")]
    

    这样做会影响列名的显示方式:

    summary_results <- rename(summary_results,
       c("Num_Records"       = "# Records",
         "Num_Converters"    = "# Converters",
         "Sum_Spend"         = "Total Spend",
         "Pct_of_Records"    = "% of Records",
         "Pct_of_Converters" = "% of Converters",
         "Pct_of_Spend"      = "% of Spend",
         "Pct_Response"      = "% Conversion",
         "Spend_per_Record"  = "$ per Record"))
    
    print(summary_results[summary_results$Sample == 'Train', -1], row.names = FALSE)
    print(summary_results[summary_results$Sample == 'Test' , -1], row.names = FALSE)
    

    输出I&#39; m能够在R中生成 image description

    这是我第一次发布Stack Overflow,我是一个相对较新的R用户。我希望我的代码是可以理解的!提前感谢您的任何帮助。

1 个答案:

答案 0 :(得分:1)

不太好,也没有解决方案:

library(gains)
library(expss)

full_dataset = MineThatData

reduced_dataset = full_dataset[ , 
                                 c("conversion","spend","train","logistic.score")]
reduced_dataset$score = round(reduced_dataset$logistic.score, 8)

summary_fun = function(data){
    calc(data, 
         list(
             "# Records" = NROW(data),
             "# Converters" = sum(conversion), 
             "$ Spend" = sum(spend), 
             "% Conversion" = round(mean(conversion),4)*100, 
             "$ per Record" = round(mean(spend),2)    
         )
    )
}

reduced_dataset %>% 
    compute({
        decile_points =  quantile(score[train==1],
                                  probs = seq(0,1,by = 0.1)
        )
        decile_points[length(decile_points)] = Inf
        # '11 - ' is needed make reverse order
        decile = 11 - as.integer(cut(score, decile_points, include.lowest = TRUE)) 
        rm(decile_points) # we don't need this in our dataset

    }) %>% 
    # "|" to suppress variable label
    tab_rows("|" = decile, total(label = "Total")) %>% 
    tab_cols(total(label = "|")) %>% 
    tab_cells(sheet(conversion, 
                    spend)) %>% 
    tab_subgroup(train==1) %>% 
    tab_stat_fun_df(summary_fun, label = "Train") %>% 
    tab_subgroup(train==0) %>% 
    tab_stat_fun_df(summary_fun, label = "Test") %>% 
    tab_pivot(stat_position = "inside_columns") %>% 
    # calculate  percent
    do_repeat(i = perl("Records|Converters|Spend"), {
        ..[gsub("#|\\$", "% of", .item_value, perl = TRUE)] = i/i[.N]*100
    }) %>%
    # move some columns to the end
    keep(!fixed("% Conversion") & !fixed("$ per Record"), other()) %>% 
    # formating
    do_repeat(i = fixed("#"), {
        i = format(i, big.mark = ",")
    }) %>% 
    do_repeat(i = fixed("$"), {
        i = paste0("$", format(i, big.mark = ","))
    }) %>%
    do_repeat(i = fixed("%"), {
        i = paste0(format(round(i, 1), nsmall=1), "%")
    }) %>% 
    htmlTable()

这会得到以下结果:table

免责声明:我是'expss'套餐的作者。