我确定我不是唯一一个问过这个问题的人,但是经过数小时没有运气的搜索,我需要亲自问这个问题。
我有这样的df(rp):
rp <- structure(list(agec1 = c(7, 16, 11, 11, 17, 17),
agec2 = c(6, 12, 9, 9, 16, 15),
agec3 = c(2, 9, 9, 9, 14, NA),
agec4 = c(NA, 7, 9, 9, 13, NA),
agec5 = c(NA, 4, 7, 7, 10, NA),
agec6 = c(NA, NA, 6, 6, 9, NA),
agec7 = c(NA, NA, NA, NA, 7, NA),
agec8 = c(NA, NA, NA, NA, 5, NA),
row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
agecX中的每个obs指的是父母的孩子的年龄,最多8个孩子。我想创建一个新列“ agec5_12”,其中包含5-12岁年龄最大的孩子的年龄。所以我的df看起来像这样:
rpage <- structure(list(agec1 = c(7, 16, 11, 11, 17, 17),
agec2 = c(6, 12, 9, 9, 16, 15),
agec3 = c(2, 9, 9, 9, 14, NA),
agec4 = c(NA, 7, 9, 9, 13, NA),
agec5 = c(NA, 4, 7, 7, 10, NA),
agec6 = c(NA, NA, 6, 6, 9, NA),
agec7 = c(NA, NA, NA, NA, 7, NA),
agec8 = c(NA, NA, NA, NA, 5, NA),
agec5_12 = c(7, 12, 11, 11, 10, NA))
row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
关于我的数据的注释:
我尝试编写函数并使用rowwise
和mutate
来应用它:
fun.age5_12 <- function(x){
x[which(x == max(x[(x > 4) & (x < 13)], na.rm = TRUE))]
}
rpage <- rp %>%
select(-c(20:21, 199:200)) %>%
rowwise() %>%
mutate(agec5_12 = fun.age5_12(c(1:8)))
但是,这会将所有obs返回为“ 12”。理想情况下,我想使用dplyr进行此操作。任何使用mutate
或ifelse
且不一定带有功能的建议都可以。
谢谢
答案 0 :(得分:2)
我知道您想要tidyverse,但这是一种基本的R方法:
data.frame(
agec1 = c(7, 16, 11, 11, 17, 17),
agec2 = c(6, 12, 9, 9, 16, 15),
agec3 = c(2, 9, 9, 9, 14, NA),
agec4 = c(NA, 7, 9, 9, 13, NA),
agec5 = c(NA, 4, 7, 7, 10, NA),
agec6 = c(NA, NA, 6, 6, 9, NA),
agec7 = c(NA, NA, NA, NA, 7, NA),
agec8 = c(NA, NA, NA, NA, 5, NA),
stringsAsFactors = FALSE
) -> rp
for (i in 1:nrow(rp)) {
agec5_12 <- unlist(rp[i,], use.names = FALSE)
agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
rp[i, "agec5_12"] <- if (length(agec5_12)) max(agec5_12) else NA_integer_
}
rp
## agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
## 1 7 6 2 NA NA NA NA NA 7
## 2 16 12 9 7 4 NA NA NA 12
## 3 11 9 9 9 7 6 NA NA 11
## 4 11 9 9 9 7 6 NA NA 11
## 5 17 16 14 13 10 9 7 5 10
## 6 17 15 NA NA NA NA NA NA NA
for
显示了惯用法,但是sapply()
解决方案要快得多:
rp1$agec5_12 <- sapply(1:nrow(rp), function(i) {
agec5_12 <- unlist(rp[i,], use.names = FALSE)
agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
if (length(agec5_12)) max(agec5_12) else NA_integer_
})
答案 1 :(得分:1)
另一种基础R解决方案。我们可以使用replace
来替换5到12之外的数字,然后使用apply
和function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE))
来找到每一行的最大值。您也可以考虑直接使用max
,但是对于元素为NA
的行,max
函数将返回-Inf
。
rp$agec5_12 <- apply(replace(rp, rp > 12 | rp < 5, NA), 1,
function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))
或使用do.call
和pmax
。
rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))
这里是到目前为止三种基本R方法的性能比较。 do.call
和pmax
似乎是最快的。
library(microbenchmark)
perf <- microbenchmark(
m1 = {sapply(1:nrow(rp), function(i) {
agec5_12 <- unlist(rp[i,], use.names = FALSE)
agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
if (length(agec5_12)) max(agec5_12) else NA_integer_
})},
m2 = {
apply(replace(rp, rp > 12 | rp < 5, NA), 1,
function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))
},
m3 = {rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))
}, times = 1000L)
perf
# Unit: microseconds
# expr min lq mean median uq max neval cld
# m1 505.318 559.2935 860.3941 608.386 1231.937 9844.699 1000 b
# m2 526.394 568.0325 831.6851 629.205 1207.262 4748.342 1000 b
# m3 384.514 425.1250 635.3154 465.736 918.362 8992.393 1000 a
数据
rp <- data.frame(
agec1 = c(7, 16, 11, 11, 17, 17),
agec2 = c(6, 12, 9, 9, 16, 15),
agec3 = c(2, 9, 9, 9, 14, NA),
agec4 = c(NA, 7, 9, 9, 13, NA),
agec5 = c(NA, 4, 7, 7, 10, NA),
agec6 = c(NA, NA, 6, 6, 9, NA),
agec7 = c(NA, NA, NA, NA, 7, NA),
agec8 = c(NA, NA, NA, NA, 5, NA)
)
答案 2 :(得分:1)
我认为,apply
解决方案将比dplyr
(我假设您是指tidyverse
)解决方案更简单易读,但自您提出要求以来,这是一种方法-
library(dplyr)
library(tidyr)
rp %>%
rownames_to_column("parent_id") %>%
gather(variable, value, -parent_id) %>%
group_by(parent_id) %>%
arrange(parent_id, desc(value)) %>%
mutate(
agec5_12 = value[between(value, 5, 12)][1]
) %>%
ungroup() %>%
spread(variable, value) %>%
select(3:10, 2)
# A tibble: 6 x 9
agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 7 6 2 NA NA NA NA NA 7
2 16 12 9 7 4 NA NA NA 12
3 11 9 9 9 7 6 NA NA 11
4 11 9 9 9 7 6 NA NA 11
5 17 16 14 13 10 9 7 5 10
6 17 15 NA NA NA NA NA NA NA
答案 3 :(得分:1)
自您提出要求以来,这是一种纯粹的dplyr
方法-
max5_12 <- function(x) {
a <- sort(x, decreasing = T)
a[a >= 5 & a <= 12][1]
}
rp %>%
t() %>%
as.data.frame() %>%
bind_rows(
summarise_all(., max5_12)
) %>%
t() %>%
as.data.frame() %>%
setNames(c(names(rp), "agec5_12"))
agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
V1 7 6 2 NA NA NA NA NA 7
V2 16 12 9 7 4 NA NA NA 12
V3 11 9 9 9 7 6 NA NA 11
V4 11 9 9 9 7 6 NA NA 11
V5 17 16 14 13 10 9 7 5 10
V6 17 15 NA NA NA NA NA NA NA
答案 4 :(得分:0)
我想到的最简单的方法是使用dplyr
,purrr
和tidyr
:
library(dplyr)
library(purrr)
library(tidyr)
rp %>%
mutate_at(vars(agec1:agec8), funs(ifelse(between(., 5, 12), ., NA))) %>%%
group_by(id) %>%
nest() %>%
mutate(agec5_12 = map(data, max, na.rm = TRUE),
agec5_12 = ifelse(agec5_12 == -Inf, NA, agec5_12)) %>%
unnest()