我有一个下面提到的代码:
library(dplyr)
# Create sample data frame
df <- data.frame(
ID = 1:6,
SR1 = c(123,124,125,125,785,849),
SR2 = c("as#12.c", "ae&14.v", "at$19.e", "at$19.d", "ab&22.n", "ab&22.n"),
DRC1 = c("ABC-1", "ABC-1", "AXX-1", "AXX-1", "AWZ-2", "AWZ-5"),
DX2 = c("SXI", "SXI", NA, "SCV", "DDF", "DDF"),
stringsAsFactors = FALSE
)
# Create a function to give Status with each kind of DRC1 according to your rules
StatusJudge <- function(df_sub) {
if (dim(df_sub)[1] == 1) {
df_sub$Status <- FALSE
}
else {
if (all(!is.na(df_sub$DX2))) {
df_sub$Status <-
ifelse(length(unique(df_sub$DX2)) == 1, TRUE, FALSE)
}
else {
df_sub$Status <-
ifelse(length(unique(df_sub$SR1)) == 1 | length(unique(df_sub$SR2)) == 1, TRUE, FALSE)
if (any(!is.na(df_sub$DX2))) {
df_sub$IDfound[is.na(df_sub$DX2)] <-
df_sub$ID[!is.na(df_sub$DX2)][1]
}
}
}
return(df_sub)
}
# Apply the StatusJudge to each element of df_list and then combine the results
df <- df %>%
mutate(Status = NA, IDfound = NA) %>%
group_by(DRC1) %>%
do(StatusJudge(.)) %>%
arrange(ID)
这给出了下面提到的输出:
ID SR1 SR2 DRC1 DX2 Status IDfound
<int> <dbl> <chr> <chr> <chr> <lgl> <int>
1 123 as#12.c ABC-1 SXI TRUE NA
2 124 ae&14.v ABC-1 SXI TRUE NA
3 125 at$19.e AXX-1 <NA> TRUE 4
4 125 at$19.d AXX-1 SCV TRUE NA
5 785 ab&22.n AWZ-2 DDF FALSE NA
6 849 ab&22.n AWZ-5 DDF FALSE NA
这里的问题是我有一个很大的数据集(大约100万行),即使等了将近4个小时后我也没有得到输出,所以花了太多时间。但是相同的代码适用于小数据集(~10K行等)。
请帮助加快此代码。
答案 0 :(得分:1)
请使用您的生产数据测试此data.table
方法。我试图将嵌套的if ... else
和ifelse()
语句转换为布尔表达式。
这似乎与小样本数据集一样有效,但需要对更多测试用例进行全面测试。
library(data.table)
# use boolean expressions instead of if ... else clauses to create Status
setDT(df)[, Status := .N != 1L &&
(all(!is.na(DX2)) && uniqueN(DX2 == 1L) ||
any(is.na(DX2)) && (uniqueN(SR1) == 1L || uniqueN(SR2) == 1L)), by = DRC1][]
# append IDfound column
# create lookup table
mDT <- df[!is.na(DX2), .(DX2 = NA_character_, first(ID)), by = DRC1][]
# join with lookup table and update during join
df[mDT, on = .(DX2, DRC1), IDfound := V2][]
ID SR1 SR2 DRC1 DX2 Status IDfound 1: 1 123 as#12.c ABC-1 SXI TRUE NA 2: 2 124 ae&14.v ABC-1 SXI TRUE NA 3: 3 125 at$19.e AXX-1 NA TRUE 4 4: 4 125 at$19.d AXX-1 SCV TRUE NA 5: 5 785 ab&22.n AWZ-2 DDF FALSE NA 6: 6 849 ab&22.n AWZ-5 DDF FALSE NA
查找表mDT
用于查找DX2
和DRC1
列中的匹配项。 V2
包含每个ID
组中第一行的DRC1
,其中DX2
不 NA
。
mDT
DRC1 DX2 V2 1: ABC-1 NA 1 2: AXX-1 NA 4 3: AWZ-2 NA 5 4: AWZ-5 NA 6
只有df
和DX2
匹配的DRC1
条目才会更新。通过加入,它会查找DRC1
匹配且DX2
为NA
的行。如果找到,则将相应的V2
值复制到列IDfound
。