我想根据一个值是否出现在30多个列之一中来创建一个新列。
以下是示例数据:
# Sample data: three rows by seven character columns ACT_1 ... ACT_7.
# "DBA" occurs once in each of the first two rows and never in the third.
df <- read.table(
  text = c(
    "ACT_1 ACT_2 ACT_3 ACT_4 ACT_5 ACT_6 ACT_7",
    "DBA ABC ABC ABC ABC ABC ABC",
    "ABC DBA ABC ABC ABC ABC ABC",
    "ABC ABC ABC ABC ABC ABC ABC"
  ),
  stringsAsFactors = FALSE,
  header = TRUE
)
我想检查名称中包含“ACT”的所有列,并创建一个新的二进制列:如果该行中包含“DBA”则为1,否则为0。我想使用dplyr。
答案 0 :(得分:1)
另一种方法是使用dplyr / tidyr对数据进行重塑:
library(dplyr)
library(tidyr)
# Reshape approach: tag each row with an id, pivot the ACT columns to long
# form, flag every row in which any value equals "DBA", then pivot back to
# wide form and drop the helper id.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread().
df %>%
  mutate(row = row_number()) %>%
  pivot_longer(starts_with("ACT"), names_to = "key", values_to = "value") %>%
  group_by(row) %>%
  mutate(flag = as.integer(any(value == "DBA"))) %>%
  # Ungroup before widening so `row` is an ordinary id column again.
  ungroup() %>%
  pivot_wider(names_from = key, values_from = value) %>%
  select(-row)
# A tibble: 3 x 8
# flag ACT_1 ACT_2 ACT_3 ACT_4 ACT_5 ACT_6 ACT_7
# <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 1 DBA ABC ABC ABC ABC ABC ABC
#2 1 ABC DBA ABC ABC ABC ABC ABC
#3 0 ABC ABC ABC ABC ABC ABC ABC
或者在base R中,我们也可以使用apply:
# Flag rows in which any column whose name starts with "ACT" equals "DBA":
# 1 when at least one match is found in the row, 0 otherwise.
df[["flag"]] <- as.integer(
  apply(df[startsWith(names(df), "ACT")] == "DBA", MARGIN = 1, FUN = any)
)
答案 1 :(得分:0)
在base R中,我们使用grep(或startsWith)选取名称以“ACT”开头的列,检查其是否等于“DBA”以得到逻辑矩阵,然后用rowSums统计每行中TRUE元素的个数,并通过判断该个数是否大于0将其转换为逻辑向量(vector)。最后使用as.integer(或一元运算符+)将此逻辑向量转换为二进制(0/1)值:
# Count the "DBA" matches per row across the ACT columns; the unary plus
# turns the "count > 0" logical vector into 0/1.
df[["newCol"]] <- +(
  rowSums(df[startsWith(names(df), "ACT")] == "DBA") > 0
)
df[["newCol"]]
#[1] 1 1 0
或者另一种base R方法是将Reduce与lapply一起使用:
# Compare each ACT column with "DBA", OR the per-column results together
# element by element, and convert the final logical vector to 0/1.
df[["newCol"]] <- +(Reduce(
  `|`,
  lapply(df[startsWith(names(df), "ACT")], function(col) col == "DBA")
))
注意:两种解决方案都是矢量化的
或使用tidyverse,而不进行任何重塑:
library(tidyverse)
# Compare only the ACT columns with "DBA" (select() keeps any other column
# out of the test), OR the per-column results together row by row, and store
# the outcome as 0/1 in newCol.
df %>%
  mutate(newCol = select(., starts_with("ACT")) %>%
           map(~ .x == "DBA") %>%
           reduce(`|`) %>%
           as.integer())
# ACT_1 ACT_2 ACT_3 ACT_4 ACT_5 ACT_6 ACT_7 newCol
#1 DBA ABC ABC ABC ABC ABC ABC 1
#2 ABC DBA ABC ABC ABC ABC ABC 1
#3 ABC ABC ABC ABC ABC ABC ABC 0
或使用data.table
library(data.table)
# setDT() converts df to a data.table by reference and := adds newCol in place.
# .SDcols limits .SD to the ACT columns, and reducing with `|` (rather than
# `+`) keeps newCol binary even when "DBA" occurs more than once in a row.
setDT(df)[, newCol := +(Reduce(`|`, lapply(.SD, `==`, "DBA"))),
          .SDcols = grep("^ACT", names(df), value = TRUE)]
在示例中,只有“ACT”列。如果还有其他列,请确保像第一个解决方案那样使用grep,并通过.SDcols将比较限定在这些“ACT”列上。
# data: benchmark input that cycles through the rows of df 1e6 times
df1 <- df[rep.int(seq_len(nrow(df)), times = 1e6), ]
-base R
# Base R timings on df1; the commented figures after each call are the
# recorded system.time() output for that exact expression.
# rowSums approach: count "DBA" matches per row, then test for a count > 0.
system.time(+(rowSums(df1[grep("^ACT", names(df1))] == "DBA") > 0))
# user system elapsed
# 0.319 0.101 0.419
# Reduce approach: OR together the column-wise comparisons with "DBA".
system.time(+(Reduce(`|`, lapply(df1[grep("^ACT", names(df1))], `==`, "DBA"))))
# user system elapsed
# 0.152 0.029 0.179
# apply approach: call any() once per row of the logical matrix.
system.time(as.integer(apply(df1[grep("^ACT", names(df1))] == "DBA", 1, any)))
# user system elapsed
# 5.200 0.177 5.344
-tidyverse
# Reshape approach timed on df1: add a row id, gather the ACT columns to long
# form, flag each row group containing "DBA", spread back to wide form, and
# drop the helper id. The figures below are the recorded output for this
# gather()/spread() version.
system.time({df1 %>%
mutate(row = row_number()) %>%
gather(key, value, starts_with("ACT")) %>%
group_by(row) %>%
mutate(flag = as.integer(any(value == "DBA"))) %>%
spread(key, value) %>%
ungroup() %>%
select(-row)})
# user system elapsed
# 42.750 4.378 47.202
# map/reduce approach timed on df1: every column of df1 is compared with
# "DBA" (there is no ACT filter) and the results are OR-ed together.
# There is no as.integer() step here, so newCol is a logical vector.
system.time({
df1 %>%
mutate(newCol = map(., ~.x == "DBA") %>%
reduce(`|`))
})
# user system elapsed
# 0.188 0.016 0.203
-data.table
# data.table approach timed on df1: setDT() converts df1 by reference and :=
# adds newCol in place. No .SDcols is given, so .SD covers every column.
# NOTE(review): Reduce(`+`) sums the per-column matches, so newCol is a
# per-row count of "DBA"; it is 0/1 only when no row has more than one match.
system.time({
setDT(df1)[, newCol := +(Reduce(`+`, lapply(.SD, `==`, "DBA")))]
})
# user system elapsed
# 0.152 0.011 0.163