我有一个数据框,其中一些列用于定义分组,另一些列(下面示例数据中的 a1–a4)在每个分组内只有一行有值,其余行均为 NA。
structure(list(gp = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gp1", class = "factor"), id = c(1, 1, 2, 2, 2, 2, 3, 3, 3), name = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), a1 = c(0.4, NA, NA, NA, NA, NA, 0.3, NA, NA), a2 = c(NA, NA, NA, 1, NA, NA, NA, NA, NA), a3 = c(NA, 1.2, NA, NA, NA, NA, NA, NA, NA), a4 = c(NA, NA, 1, NA, NA, NA, NA, NA, 1)), .Names = c("gp", "id", "name", "a1", "a2", "a3", "a4"), row.names = c(NA, -9L), class = "data.frame")
由于我知道每个分组内 a1–a4 各列至多只有一行真正有值,并不需要保留多行,因此我想把组内的所有值合并到一行。我期望得到如下结果。
structure(list(gp = structure(c(1L, 1L, 1L), .Label = "gp1", class = "factor"), id = c(1, 2, 3), name = structure(1:3, .Label = c("A", "B", "C"), class = "factor"), a1 = c(0.4, NA, 0.3), a2 = c(NA, 1, NA), a3 = c(1.2, NA, NA), a4 = c(NA, 1, 1)), .Names = c("gp", "id", "name", "a1", "a2", "a3", "a4"), row.names = c(NA, -3L), class = "data.frame")
我怎样才能实现这一目标?如果解决方案使用tidyverse会很棒。
答案 0 :(得分:1)
你可以试试这个
library(tidyverse)
# Collapse every (gp, id, name) group to one row: each remaining column is
# summed with NAs dropped, so a column that is all NA in a group yields 0.
df1 %>%
group_by(gp, id, name) %>%
summarise_all(sum, na.rm = TRUE) %>%
# Turn those 0s back into NA. This relies on a1-a4 containing no genuine 0,
# because a real 0 would be converted to NA as well.
# NOTE(review): this second summarise_all runs over the grouping left by the
# first one (the printed result is grouped by gp only); confirm that a
# mutate-style verb is not what is actually wanted here.
summarise_all(na_if, 0)
# A tibble: 3 x 7
# Groups: gp [?]
# gp id name a1 a2 a3 a4
# <fct> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
#1 gp1 1. A 0.400 NA 1.20 NA
#2 gp1 2. B NA 1. NA 1.
#3 gp1 3. C 0.300 NA NA 1.
第一次 summarise_all 之后的结果中不会有 NA,而是 0(整组全为 NA 时求和得到 0),因此需要第二次调用 summarise_all 把 0 换回 NA。这里我假设 a1 到 a4 列中原本没有 0。
下面是针对初始数据集中本身含有 0 的情况的解决方案。
# Sum of the non-missing values in `x`, or NA when `x` has no non-missing
# value at all. A plain `sum(x, na.rm = TRUE)` returns 0 for an all-NA input,
# which would make "no data" indistinguishable from a genuine zero.
#
# x: vector to total (anything `sum()` accepts).
# Returns: `sum(x, na.rm = TRUE)`, or a bare `NA` if every element is missing
#   (this includes a zero-length `x`).
sum_NA <- function(x) {
  any_present <- any(!is.na(x))
  if (any_present) {
    return(sum(x, na.rm = TRUE))
  }
  NA
}
# Collapse every (gp, id, name) group to one row using sum_NA, which returns
# NA only when a column is entirely NA within the group; genuine zeros in the
# data (df2 has 0 in a1 and a2) therefore survive as 0.
df2 %>%
group_by(gp, id, name) %>%
summarise_all(sum_NA)
# A tibble: 3 x 7
# Groups: gp, id [?]
# gp id name a1 a2 a3 a4
# <fct> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
#1 gp1 1. A 0. NA 1.20 NA
#2 gp1 2. B NA 0. NA 1.
#3 gp1 3. C 0.300 NA NA 1.
数据
# df1: the question's example data - within each (gp, id, name) group every
# a1-a4 column has at most one non-NA row.
df1 <- structure(list(gp = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gp1", class = "factor"), id = c(1, 1, 2, 2, 2, 2, 3, 3, 3), name = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), a1 = c(0.4, NA, NA, NA, NA, NA, 0.3, NA, NA), a2 = c(NA, NA, NA, 1, NA, NA, NA, NA, NA), a3 = c(NA, 1.2, NA, NA, NA, NA, NA, NA, NA), a4 = c(NA, NA, 1, NA, NA, NA, NA, NA, 1)), .Names = c("gp", "id", "name", "a1", "a2", "a3", "a4"), row.names = c(NA, -9L), class = "data.frame")
# df2: same layout as df1, but with genuine zeros (a1 in row 1, a2 in row 4)
# to exercise the case where 0 must not be mistaken for "missing".
df2 <- structure(list(gp = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gp1", class = "factor"), id = c(1, 1, 2, 2, 2, 2, 3, 3, 3), name = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"), a1 = c(0.0, NA, NA, NA, NA, NA, 0.3, NA, NA), a2 = c(NA, NA, NA, 0, NA, NA, NA, NA, NA), a3 = c(NA, 1.2, NA, NA, NA, NA, NA, NA, NA), a4 = c(NA, NA, 1, NA, NA, NA, NA, NA, 1)), .Names = c("gp", "id", "name", "a1", "a2", "a3", "a4"), row.names = c(NA, -9L), class = "data.frame")
答案 1 :(得分:1)
由于提问者(OP)已经提到每组只有一行有值,一个可行的做法是在应用 group_by 之后使用 dplyr::first。我更倾向于使用 summarise_at,以便灵活地排除不需要分析的列。
library(dplyr)
# For each (gp, id, name) group, reduce every column whose name starts with
# "a" to a single value: sort() drops NAs by default, so first(sort(.)) is
# the smallest non-NA value in the group.
# NOTE(review): for an all-NA column sort(.) is empty; the output shown below
# has NA in those cells, so first() presumably yields NA then - confirm.
# NOTE(review): `df` is not defined in this snippet; presumably it is the
# question's data frame.
df %>% group_by(gp, id, name) %>%
summarise_at(vars(starts_with("a")), funs(dplyr::first(sort(.)))) %>%
# Return the result as a plain data.frame.
as.data.frame()
# gp id name a1 a2 a3 a4
# 1 gp1 1 A 0.4 NA 1.2 NA
# 2 gp1 2 B NA 1 NA 1
# 3 gp1 3 C 0.3 NA NA 1
答案 2 :(得分:0)
# For each (gp, id, name) group, collapse every remaining column to one value.
# lift(coalesce)(.) presumably spreads the column's elements out as separate
# arguments to coalesce(), i.e. it picks the first non-NA value in the group
# (NA when there is none, as in the output shown below) - confirm.
# NOTE(review): assumes purrr (lift) and dplyr are attached, e.g. via
# library(tidyverse); `dat` is presumably the question's data frame.
dat%>%
group_by(gp,id,name)%>%
summarise_all(funs(lift(coalesce)(.)))
# A tibble: 3 x 7
# Groups: gp, id [?]
# gp id name a1 a2 a3 a4
# <fct> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 gp1 1. A 0.400 NA 1.20 NA
# 2 gp1 2. B NA 1. NA 1.
# 3 gp1 3. C 0.300 NA NA 1.