我有一个包含许多列的数据框。例如:
sample treatment col5 col6 col7
1 a 3 0 5
2 a 1 0 3
3 a 0 0 2
4 b 0 1 1
我想选择 sample 和 treatment 这两列,以及满足以下 2 个条件的所有列:
1. 在 treatment == 'b' 的行上,该列的值为 0;
2. 在 treatment == 'a' 的行中,该列至少有一个值不为 0。
预期结果应如下所示:
sample treatment col5
1 a 3
2 a 1
3 a 0
4 b 0
示例数据框:
# dput() representation of the example data frame: 4 rows with an integer
# `sample` id, a factor `treatment` (levels "a" and "b"; rows 1-3 are "a",
# row 4 is "b") and three numeric value columns col5, col6 and col7.
# The expression is not assigned to a name here.
structure(list(sample = 1:4, treatment = structure(c(1L, 1L,
1L, 2L), .Label = c("a", "b"), class = "factor"), col5 = c(3,
1, 0, 0), col6 = c(0, 0, 0, 1), col7 = c(5, 3, 2, 1)), class = "data.frame", row.names = c(NA,
-4L))
答案 0 :(得分:3)
这是使用 base R(基础 R)的一种方法:
# Per value column (everything except the first two columns, sample and
# treatment), test the two selection conditions.
# - Non-zero entries are counted instead of summing the raw values, so that
#   positive and negative values cannot cancel out to a sum of 0.
# - drop = FALSE keeps the subset two-dimensional even when only a single
#   value column is left, which colSums() requires.
# cs_a: TRUE where at least one value in the treatment "a" rows is non-zero.
cs_a <- colSums(df[df$treatment == "a", -(1:2), drop = FALSE] != 0) > 0
# cs_b: TRUE where every value in the treatment "b" rows is zero.
cs_b <- colSums(df[df$treatment == "b", -(1:2), drop = FALSE] != 0) == 0
# Always keep the first two columns, plus the columns meeting both conditions.
df[, c(TRUE, TRUE, cs_a & cs_b)]
sample treatment col5
1 1 a 3
2 2 a 1
3 3 a 0
4 4 b 0
使用 dplyr:
# Keep the first two columns (sample, treatment) plus every value column for
# which both conditions hold. select() with all_of() replaces the superseded
# select_at(); all_of() marks the names as an external character vector.
df %>%
  select(1:2, all_of(names(cs_a)[cs_a & cs_b]))
答案 1 :(得分:0)
下面是使用 tidyverse 的一种更详细(也更冗长)的方法,无需为每个 treatment 水平手动调用 colSums:
library(dplyr)
library(purrr)
library(tidyr)
# Rebuild the example data: one row per sample, a treatment label and three
# numeric value columns.
sample <- seq_len(4)
treatment <- rep(c("a", "b"), times = c(3, 1))
col5 <- c(3, 1, 0, 0)
col6 <- c(0, 0, 0, 1)
col7 <- c(5, 3, 2, 1)
dd <- data.frame(
  sample = sample,
  treatment = treatment,
  col5 = col5,
  col6 = col6,
  col7 = col7
)
# Step 1: replace every numeric column by a logical flag that is TRUE where
# the entry equals zero (non-numeric columns such as treatment are untouched)
dd2 <- dd %>%
  mutate_if(is.numeric, function(x) x == 0)
# Step 2: for each treatment level, count the zeros in every value column.
# Summing logical flags counts the TRUE entries (TRUE is 1, FALSE is 0).
number_of_zeros <- summarise_at(
  group_by(dd2, treatment),
  .vars = vars(col5:col7),
  .funs = sum
)
# then find the names of the columns you want to keep. A column is kept when
#   * every value in the treatment "b" rows is zero, and
#   * at least one value in the treatment "a" rows is non-zero.
# Merely requiring "at least one zero in every treatment group" is not the
# same test: it would keep a column that is zero everywhere in "a" and drop a
# column that has no zeros at all in "a".
value_columns <- setdiff(names(number_of_zeros), "treatment")

# group sizes, needed to turn zero counts into "all zero" / "not all zero"
rows_a <- sum(dd$treatment == "a")
rows_b <- sum(dd$treatment == "b")

# zero counts per value column for each of the two treatment levels,
# as named vectors in the same order as value_columns
zeros_a <- number_of_zeros %>%
  filter(treatment == "a") %>%
  select(-treatment) %>%
  unlist()
zeros_b <- number_of_zeros %>%
  filter(treatment == "b") %>%
  select(-treatment) %>%
  unlist()

# fewer zeros than rows in "a" means some value is non-zero there;
# as many zeros as rows in "b" means every value is zero there
keeper_columns <- value_columns[zeros_a < rows_a & zeros_b == rows_b]
# subset the original dataset: the first two columns (sample, treatment) plus
# the keepers. all_of() marks keeper_columns as an external character vector,
# so it can never be mistaken for a column of dd with the same name.
wanted_columns <- dd %>% select(1:2, all_of(keeper_columns))