根据分组,按列方式应用子集或行删除功能

时间:2018-03-05 15:47:44

标签: r subset

一些样本数据框(实际数据有20个变量的500k观测值):

# Build a reproducible 1000-row example data frame: a three-level ID factor,
# four numeric variables and one timestamp column.
set.seed(1)
# IDs are drawn from A, B and C with unequal probabilities.
IDs <- as.factor(sample(
  LETTERS[1:3],
  size = 1000, replace = TRUE, prob = c(0.2, 0.3, 0.5)
))
Var1 <- sample(x = 20:1500, size = 1000, replace = TRUE)
Var2 <- sample(x = 1:15, size = 1000, replace = TRUE)
# 0.1:8.5 steps by 1 starting at 0.1, i.e. 0.1, 1.1, ..., 8.1
Var3 <- sample(x = 0.1:8.5, size = 1000, replace = TRUE)
Var4 <- sample(x = 12:255, size = 1000, replace = TRUE)
Var5 <- sample(x = 14000000:15000000, size = 1000, replace = TRUE)
dataframe <- data.frame(IDs, Var1, Var2, Var3, Var4, Var5)
# Store the timestamps as POSIXct (seconds since the origin in one numeric
# vector); POSIXlt is a list of date-time components and is awkward as a
# data frame column.
dataframe$Var5 <- as.POSIXct(dataframe$Var5, origin = "1970-01-01")

对于IDs中的每个分组,我想删除Var1不在该组(平均值 ± 0.5倍标准差)范围内的所有行。

我想采用的方法是使用dplyr:通过管道把dataframe传给group_by(IDs)(即 dataframe %>% group_by(IDs)),然后应用一个函数。如果是这样,我需要有关该函数和dplyr命令的帮助。

我的第一次尝试是使用带ID的for循环:

# First attempt from the question: loop over every level of IDs and try to
# drop that group's rows based on Var1 relative to the group's
# mean +/- 0.5 standard deviations. Kept verbatim; review notes are inline.
for(ID in levels(dataframe$IDs)){
  # Get 0.5 standard deviations
  sd05 <- sd(dataframe[which(dataframe$IDs == ID), "Var1"]) * 0.5
  # Get mean for subsetting
  mean_for_subset <- mean(dataframe[which(dataframe$IDs == ID), "Var1"])
  # NOTE(review): the two tests below are joined with `&`, so a value would
  # have to be above (mean + sd05) and below (mean - sd05) at the same time;
  # no value can satisfy both.
  # NOTE(review): the outer which() returns positions within the per-ID
  # subset, yet they are used as row indices into the whole data frame.
  # NOTE(review): the selected rows are assigned NULL, presumably to delete
  # them - confirm; rows are normally removed by subsetting instead.
  dataframe[which( dataframe[which(dataframe$IDs == ID), "Var1"] > (mean_for_subset + sd05)
    & dataframe[which(dataframe$IDs == ID), "Var1"] < (mean_for_subset - sd05))
            ,] <- NULL
}

这会发出警告,提示is.na()被应用于非(列表或向量)的对象,并且dataframe仍然有1000个观察值。

1 个答案:

答案 0 :(得分:2)

使用data.table:

library(data.table)
dataframe <- data.table(dataframe)
# Overall mean of Var1 and half of its standard deviation
meanV1 <- mean(dataframe[["Var1"]])
sdV1 <- sd(dataframe[["Var1"]]) * 0.5
# Keep rows whose Var1 lies strictly inside (mean - 0.5 sd, mean + 0.5 sd)
dataframe <- dataframe[Var1 > meanV1 - sdV1 & Var1 < meanV1 + sdV1]

如果要通过ID完成:

library(data.table)
dataframe <- data.table(dataframe)
# Add, by reference, each ID group's mean and half standard deviation of Var1
# as the helper columns mean1 and sd1.
dataframe[, `:=`(
  mean1 = mean(Var1),
  sd1 = sd(Var1) * 0.5
), by = IDs]
# Keep rows whose Var1 lies strictly inside the band of their own group
dataframe <- dataframe[Var1 > mean1 - sd1 & Var1 < mean1 + sd1]

然后删除新增的辅助列:

# Remove the helper columns by reference; assigning NULL deletes a column
dataframe[, `:=`(mean1 = NULL, sd1 = NULL)]

同时对两列(Var1和Var2)进行筛选:

library(data.table)
dataframe <- data.table(dataframe)
# Add, by reference, each ID group's mean and half standard deviation of
# Var1 and Var2 as temporary helper columns. The right-hand side is a single
# list() with one element per new column, in the same order as the names.
dataframe[, c(
  "mean1",
  "sd1",
  "mean2",
  "sd2"
) := list(
  mean(Var1),
  0.5 * sd(Var1),
  mean(Var2),
  0.5 * sd(Var2)
),
by = IDs
]

# Keep only rows where both Var1 and Var2 lie strictly inside the band
# (mean - 0.5 sd, mean + 0.5 sd) of their own ID group.
dataframe <- dataframe[
  Var1 < mean1 + sd1 &
  Var1 > mean1 - sd1 &
  Var2 < mean2 + sd2 &
  Var2 > mean2 - sd2
]

# Drop the helper columns again; assigning NULL deletes them by reference
dataframe[, c("mean1", "sd1", "mean2", "sd2") := NULL]