根据多个列中的条件将值从df输入到新变量中

时间:2018-11-16 02:57:22

标签: r dplyr

我确定我不是唯一一个问过这个问题的人,但是经过数小时没有运气的搜索,我需要亲自问这个问题。

我有这样的df(rp):

rp <- structure(list(agec1 = c(7, 16, 11, 11, 17, 17), 
               agec2 = c(6, 12, 9, 9, 16, 15), 
               agec3 = c(2, 9, 9, 9, 14, NA), 
               agec4 = c(NA, 7, 9, 9, 13, NA), 
               agec5 = c(NA, 4, 7, 7, 10, NA), 
               agec6 = c(NA, NA, 6, 6, 9, NA), 
               agec7 = c(NA, NA, NA, NA, 7, NA), 
               agec8 = c(NA, NA, NA, NA, 5, NA), 
          row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

agecX中的每个obs指的是父母的孩子的年龄,最多8个孩子。我想创建一个新列“ agec5_12”,其中包含5-12岁年龄最大的孩子的年龄。所以我的df看起来像这样:

rpage <- structure(list(agec1 = c(7, 16, 11, 11, 17, 17), 
               agec2 = c(6, 12, 9, 9, 16, 15), 
               agec3 = c(2, 9, 9, 9, 14, NA), 
               agec4 = c(NA, 7, 9, 9, 13, NA), 
               agec5 = c(NA, 4, 7, 7, 10, NA), 
               agec6 = c(NA, NA, 6, 6, 9, NA), 
               agec7 = c(NA, NA, NA, NA, 7, NA), 
               agec8 = c(NA, NA, NA, NA, 5, NA), 
               agec5_12 = c(7, 12, 11, 11, 10, NA))
          row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

关于我的数据的注释:

  • 年龄并不总是按相同的时间顺序排列,即年龄最小到年龄最大或年龄最小到最小年龄
  • 连续可能没有这个年龄段的孩子(在这种情况下,我希望返回NA)

我尝试编写函数并使用rowwisemutate来应用它:

fun.age5_12 <- function(x){
                 x[which(x == max(x[(x > 4) & (x < 13)], na.rm = TRUE))]
                }
rpage <- rp %>%
         select(-c(20:21, 199:200)) %>%
         rowwise() %>% 
         mutate(agec5_12 = fun.age5_12(c(1:8)))

但是,这会将所有obs返回为“ 12”。理想情况下,我想使用dplyr进行此操作。任何使用mutateifelse且不一定带有功能的建议都可以。

谢谢

5 个答案:

答案 0 :(得分:2)

我知道您想要tidyverse,但这是一种基本的R方法:

data.frame(
  agec1 = c(7, 16, 11, 11, 17, 17), 
  agec2 = c(6, 12, 9, 9, 16, 15), 
  agec3 = c(2, 9, 9, 9, 14, NA), 
  agec4 = c(NA, 7, 9, 9, 13, NA), 
  agec5 = c(NA, 4, 7, 7, 10, NA), 
  agec6 = c(NA, NA, 6, 6, 9, NA), 
  agec7 = c(NA, NA, NA, NA, 7, NA), 
  agec8 = c(NA, NA, NA, NA, 5, NA), 
  stringsAsFactors = FALSE
) -> rp

for (i in 1:nrow(rp)) {
  agec5_12 <- unlist(rp[i,], use.names = FALSE)
  agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
  rp[i, "agec5_12"] <- if (length(agec5_12)) max(agec5_12) else NA_integer_
}

rp
##   agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
## 1     7     6     2    NA    NA    NA    NA    NA        7
## 2    16    12     9     7     4    NA    NA    NA       12
## 3    11     9     9     9     7     6    NA    NA       11
## 4    11     9     9     9     7     6    NA    NA       11
## 5    17    16    14    13    10     9     7     5       10
## 6    17    15    NA    NA    NA    NA    NA    NA       NA

for显示了惯用法,但是sapply()解决方案要快得多:

rp1$agec5_12 <- sapply(1:nrow(rp), function(i) {
  agec5_12 <- unlist(rp[i,], use.names = FALSE)
  agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
  if (length(agec5_12)) max(agec5_12) else NA_integer_
})

答案 1 :(得分:1)

另一种基础R解决方案。我们可以使用replace来替换5到12之外的数字,然后使用applyfunction(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE))来找到每一行的最大值。您也可以考虑直接使用max,但是对于元素为NA的行,max函数将返回-Inf

rp$agec5_12 <- apply(replace(rp, rp > 12 | rp < 5, NA), 1, 
                     function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))

或使用do.callpmax

rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))

这里是到目前为止三种基本R方法的性能比较。 do.callpmax似乎是最快的。

library(microbenchmark)

perf <- microbenchmark(
  m1 = {sapply(1:nrow(rp), function(i) {
  agec5_12 <- unlist(rp[i,], use.names = FALSE)
  agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
  if (length(agec5_12)) max(agec5_12) else NA_integer_
})},
  m2 = {
    apply(replace(rp, rp > 12 | rp < 5, NA), 1, 
          function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))
  },
  m3 = {rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))
}, times = 1000L) 

perf
# Unit: microseconds
# expr     min       lq     mean  median       uq      max neval cld
#   m1 505.318 559.2935 860.3941 608.386 1231.937 9844.699  1000   b
#   m2 526.394 568.0325 831.6851 629.205 1207.262 4748.342  1000   b
#   m3 384.514 425.1250 635.3154 465.736  918.362 8992.393  1000  a 

数据

rp <- data.frame(
  agec1 = c(7, 16, 11, 11, 17, 17), 
  agec2 = c(6, 12, 9, 9, 16, 15), 
  agec3 = c(2, 9, 9, 9, 14, NA), 
  agec4 = c(NA, 7, 9, 9, 13, NA), 
  agec5 = c(NA, 4, 7, 7, 10, NA), 
  agec6 = c(NA, NA, 6, 6, 9, NA), 
  agec7 = c(NA, NA, NA, NA, 7, NA), 
  agec8 = c(NA, NA, NA, NA, 5, NA)
) 

答案 2 :(得分:1)

我认为,apply解决方案将比dplyr(我假设您是指tidyverse)解决方案更简单易读,但自您提出要求以来,这是一种方法-

library(dplyr)
library(tidyr)

rp %>% 
  rownames_to_column("parent_id") %>% 
  gather(variable, value, -parent_id) %>% 
  group_by(parent_id) %>%
  arrange(parent_id, desc(value)) %>% 
  mutate(
    agec5_12 = value[between(value, 5, 12)][1]
  ) %>%
  ungroup() %>% 
  spread(variable, value) %>% 
  select(3:10, 2)

# A tibble: 6 x 9
  agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
1     7     6     2    NA    NA    NA    NA    NA        7
2    16    12     9     7     4    NA    NA    NA       12
3    11     9     9     9     7     6    NA    NA       11
4    11     9     9     9     7     6    NA    NA       11
5    17    16    14    13    10     9     7     5       10
6    17    15    NA    NA    NA    NA    NA    NA       NA

答案 3 :(得分:1)

自您提出要求以来,这是一种纯粹的dplyr方法-

max5_12 <- function(x) {
  a <- sort(x, decreasing = T)
  a[a >= 5 & a <= 12][1]
}

rp %>% 
  t() %>% 
  as.data.frame() %>% 
  bind_rows(
   summarise_all(., max5_12)
  ) %>% 
  t() %>% 
  as.data.frame() %>% 
  setNames(c(names(rp), "agec5_12"))

   agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
V1     7     6     2    NA    NA    NA    NA    NA        7
V2    16    12     9     7     4    NA    NA    NA       12
V3    11     9     9     9     7     6    NA    NA       11
V4    11     9     9     9     7     6    NA    NA       11
V5    17    16    14    13    10     9     7     5       10
V6    17    15    NA    NA    NA    NA    NA    NA       NA

答案 4 :(得分:0)

我想到的最简单的方法是使用dplyrpurrrtidyr

library(dplyr)
library(purrr)
library(tidyr)
rp %>%
  mutate_at(vars(agec1:agec8), funs(ifelse(between(., 5, 12), ., NA))) %>%%
  group_by(id) %>%
  nest() %>%
  mutate(agec5_12 = map(data, max, na.rm = TRUE),
         agec5_12 = ifelse(agec5_12 == -Inf, NA, agec5_12)) %>%
  unnest()