一个示例数据框(实际数据约有 50 万条观测值、20 个变量):
# Build a reproducible 1000-row example: a 3-level ID factor plus five
# numeric columns, the last of which is converted to a date-time.
set.seed(1)
IDs <- as.factor(
  sample(LETTERS[1:3], size = 1000, replace = TRUE, prob = c(0.2, 0.3, 0.5))
)
Var1 <- sample(x = 20:1500, size = 1000, replace = TRUE)
Var2 <- sample(x = 1:15, size = 1000, replace = TRUE)
# 0.1:8.5 steps by 1 from 0.1, i.e. 0.1, 1.1, ..., 8.1
Var3 <- sample(x = 0.1:8.5, size = 1000, replace = TRUE)
Var4 <- sample(x = 12:255, size = 1000, replace = TRUE)
Var5 <- sample(x = 14000000:15000000, size = 1000, replace = TRUE)
dataframe <- data.frame(IDs, Var1, Var2, Var3, Var4, Var5)
# Store the timestamp as POSIXct rather than POSIXlt: POSIXlt is a list-based
# class that is awkward as a data frame column and is not supported as a
# data.table column.
dataframe$Var5 <- as.POSIXct(dataframe$Var5, origin = "1970-01-01")
对于 IDs 中的每个主体,我想删除 Var1 不在(均值 ± 0.5 倍标准差)范围内的所有行。
我设想的做法是使用 dplyr:将 dataframe 通过管道传给 group_by(IDs),然后应用一个函数。如果这样可行,我需要关于该函数和 dplyr 命令的帮助。
我的第一次尝试是对每个 ID 使用 for 循环:
# First attempt from the question: for each level of IDs, compute the mean and
# half the standard deviation of Var1 within that ID, then try to drop the rows
# whose Var1 falls outside mean +/- 0.5 SD.
# NOTE(review): the row selection below joins its two comparisons with `&`, so
# a value would have to be above (mean + sd05) and below (mean - sd05) at the
# same time; that cannot hold, so which() selects nothing. Selecting values
# outside the band needs `|`.
# NOTE(review): the comparisons are evaluated on the per-ID subset, so the
# outer which() yields positions within that subset, yet they are used as row
# indices of the full dataframe.
# NOTE(review): rows are not removed by assigning NULL to them; the surrounding
# text reports a warning at this step and an unchanged row count.
for(ID in levels(dataframe$IDs)){
# Get 0.5 standard deviations
sd05 <- sd(dataframe[which(dataframe$IDs == ID), "Var1"]) * 0.5
# Get mean for subsetting
mean_for_subset <- mean(dataframe[which(dataframe$IDs == ID), "Var1"])
dataframe[which( dataframe[which(dataframe$IDs == ID), "Var1"] > (mean_for_subset + sd05)
& dataframe[which(dataframe$IDs == ID), "Var1"] < (mean_for_subset - sd05))
,] <- NULL
}
这会产生警告,提示 is.na() 被应用于既不是列表也不是向量的对象,而且 dataframe 仍然有 1000 条观测值。
答案 0 :(得分:2)
使用data.table:
library(data.table)
dataframe <- data.table(dataframe)
# Overall (ungrouped) mean of Var1 and half of its standard deviation
meanV1 <- mean(dataframe[["Var1"]])
sdV1 <- sd(dataframe[["Var1"]]) * 0.5
# Keep only rows whose Var1 lies strictly inside mean +/- 0.5 SD
dataframe <- dataframe[Var1 > meanV1 - sdV1 & Var1 < meanV1 + sdV1]
如果要按 ID 分组完成:
library(data.table)
dataframe <- data.table(dataframe)
# Add the per-ID mean and half standard deviation of Var1 as helper columns
# (assigned by reference, one value per group recycled to every row).
dataframe[, `:=`(mean1 = mean(Var1), sd1 = 0.5 * sd(Var1)), by = IDs]
# Keep rows whose Var1 lies strictly inside its own ID's mean +/- 0.5 SD
dataframe <- dataframe[Var1 > mean1 - sd1 & Var1 < mean1 + sd1]
然后删除新增的辅助列:
# Remove the helper columns by reference now that filtering is done
dataframe[, `:=`(mean1 = NULL, sd1 = NULL)]
对两列同时执行:
library(data.table)
dataframe <- data.table(dataframe)
# Add per-ID helper columns by reference: the mean and half the standard
# deviation of Var1 and of Var2. The four names on the left pair up, in order,
# with the four expressions inside list().
dataframe[
  ,
  c("mean1", "sd1", "mean2", "sd2") := list(
    mean(Var1),
    0.5 * sd(Var1),
    mean(Var2),
    0.5 * sd(Var2)
  ),
  by = IDs
]
# Keep rows where Var1 and Var2 each lie strictly inside their per-ID
# mean +/- 0.5 SD band.
dataframe <- dataframe[
  (Var1 > mean1 - sd1 & Var1 < mean1 + sd1) &
    (Var2 > mean2 - sd2 & Var2 < mean2 + sd2)
]
# Remove the helper columns by reference
dataframe[, `:=`(mean1 = NULL, sd1 = NULL, mean2 = NULL, sd2 = NULL)]