Question

我有以下数据，其中包含不同销售人员的每月销售额

df_monthofsuccess
M1_Sales    M2_Sales    M3_Sales    M4_Sales    M5_Sales    M6_Sales
15000       16435       12144       55536       75260       15002
35853       41020       66689       0           51495       11725
2500        24600       0           3000        0           12445
80654       0           50625       275946      37320       43000
21578       40000       0           20000       0           20000

我想找出它们达到1,00,000的月份，并用一个单独的变量捕获该月份，如下所示

M1_Sales    M2_Sales    M3_Sales    M4_Sales    M5_Sales    M6_Sales Month_Target
15000       16435       12144       55536       75260       15002       M5
35853       41020       66689       0           51495       11725       M3
2500        24600       0           3000        0           12445       FALSE
80654       0           50625       275946      37320       43000       M3
21578       40000       0           20000       0           20000       M6

我尝试使用以下代码：

# Asker's attempt: for each row, accumulate the column values and record the
# name of the column at which the running total exceeds 100000.
# NOTE(review): df_success is extended with rbind() once per row, so the
# accumulated data frame is rebuilt on every iteration.
df_success <- data.frame()
for (i in (1:nrow(df_monthofsuccess))){
  #i = 9
  x <- df_monthofsuccess[i,]
  # Running total for the current row.
  ape_tot = 0
  # Label kept when the running total never exceeds 100000.
  month = 'FALSE'
  # NOTE(review): j starts at 2, so the value in the first column is never
  # added to ape_tot.
  for (j in (2:ncol(x))){
    #j = 2
    ape_tot = ape_tot + x[,j]
    # NOTE(review): there is no break, so every later column at which the
    # total is still above 100000 overwrites month again; the first such
    # column is not preserved.
    if (ape_tot > 100000) month = names(x)[j]
    # Rewritten on every pass; the loop range 2:ncol(x) was computed before
    # this column was added, so the new column is not iterated over.
    x$monthofSuccess <- month
    # As the last statement of the loop body this `next` has no effect.
    next
  }
  df_success <- rbind(df_success,x)
}

但是，这不能提供预期的输出，并且很慢。

有人可以帮助我获得预期的结果吗？

Answer 1

我们可以在base R中使用apply遍历各行，获取该行累加和大于1e5的列索引，并提取其中第一个元素的名称（names）

# For each row: take the running total across the columns, locate the first
# column where it exceeds 1e5, and keep that column's name with the "_Sales"
# suffix removed (NA when the total never exceeds 1e5, as in row 3 below).
df1$Month_Target <- apply(df1, 1, FUN = function(sales) {
  running_total <- cumsum(sales)
  first_over <- which(running_total > 1e5)[1]
  sub("_Sales", "", names(first_over))
})
df1$Month_Target
#[1] "M5" "M3" NA   "M3" "M6"

注意：未使用任何软件包。仅base R

或与matrixStats一起使用矢量化方法

library(matrixStats)
# Row-wise running totals, one column per month.
m1 <- rowCumsums(as.matrix(df1))
# TRUE wherever the running total has exceeded 1e5.
over_target <- m1 > 1e5
# Index of the first TRUE in each row; rows with no TRUE at all are set to NA
# so that they yield NA instead of a column label.
first_col <- max.col(over_target, "first")
first_col[rowSums(over_target) == 0] <- NA
substr(names(df1), 1, 2)[first_col]
#[1] "M5" "M3" NA   "M3" "M6"

或使用tidyverse而不进行任何重塑

library(tidyverse)
# pmap() walks the data frame row by row, passing each row's values as named
# arguments; the name of the first element whose running total exceeds 1e5 is
# the matching column name, and str_remove() drops the "_Sales" suffix.
df1 %>%
  mutate(
    Month_Target = str_remove(
      pmap(., function(...) {
        running_total <- cumsum(c(...))
        names(which(running_total > 1e5)[1])
      }),
      "_Sales"
    )
  )
#  M1_Sales M2_Sales M3_Sales M4_Sales M5_Sales M6_Sales Month_Target
#1    15000    16435    12144    55536    75260    15002           M5
#2    35853    41020    66689        0    51495    11725           M3
#3     2500    24600        0     3000        0    12445         <NA>
#4    80654        0    50625   275946    37320    43000           M3
#5    21578    40000        0    20000        0    20000           M6

数据

# Sample data: one row per salesperson, one integer column per month.
df1 <- data.frame(
  M1_Sales = c(15000L, 35853L, 2500L, 80654L, 21578L),
  M2_Sales = c(16435L, 41020L, 24600L, 0L, 40000L),
  M3_Sales = c(12144L, 66689L, 0L, 50625L, 0L),
  M4_Sales = c(55536L, 0L, 3000L, 275946L, 20000L),
  M5_Sales = c(75260L, 51495L, 0L, 37320L, 0L),
  M6_Sales = c(15002L, 11725L, 12445L, 43000L, 20000L)
)

Answer 2

一种tidyverse的方法可能是：先用gather将数据转换为长格式，按行分组（group_by）后找出累计和达到100000的月份，然后再恢复为宽格式。

group_by

总计n列后，获取满足条件的列名

2 个答案:

数据