R - 对数据框取子集,仅保留拥有超过1条记录的主题

时间:2014-11-05 00:29:21

标签: r dataframe duplicates subset

我希望对数据框取子集,保留拥有> 1条记录的主题的所有记录,并排除那些只有1条记录的主题。

让我们采用以下数据框;

# Example data: 15 records spread over 10 subjects; subjects 4, 5, 6 and 9
# have more than one record. `variable` is random noise (no seed is set).
mydata <- data.frame(
  subject_id = factor(c(1, 2, 3, 4, 4, 5, 5, 6, 6, 7, 8, 9, 9, 9, 10)),
  variable = rnorm(15)
)

下面的代码使用duplicated()为我提供了拥有> 1条记录的主题;

# subject_id values of every row that repeats an earlier subject_id
# (the first occurrence of each subject is not flagged by duplicated()).
duplicates <- mydata$subject_id[duplicated(mydata$subject_id)]

但我希望保留拥有> 1条记录的每个主题的所有记录,所以我试过了;

# NOTE: `==` compares element by element and recycles the shorter vector
# (`duplicates`) along the longer one; it is not a set-membership test, so
# this does not select every row whose subject_id occurs in `duplicates`.
mydata[mydata$subject_id==as.factor(duplicates),]

这不会返回我期待的结果。

有什么想法吗?

5 个答案:

答案 0 :(得分:2)

data.table解决方案

# Reproducible example data: 15 records over 10 subjects.
set.seed(20)
subject_id <- as.factor(c(1, 2, 3, 4, 4, 5, 5, 6, 6, 7, 8, 9, 9, 9, 10))
variable <- rnorm(15)
# Build the data frame directly: cbind() would first form a numeric matrix
# and silently replace the factor with its internal integer codes.
mydata <- data.frame(subject_id, variable)

library(data.table)
# setDT() turns mydata into a data.table by reference. Within each
# subject_id group, .N is the group's row count, so .SD[.N > 1] keeps the
# group's rows only when the subject has more than one record.
setDT(mydata)
mydata[, .SD[.N > 1], by = subject_id] # @Thanks David.
#    subject_id   variable
# 1:          4 -1.3325937
# 2:          4 -0.4465668
# 3:          5  0.5696061
# 4:          5 -2.8897176
# 5:          6 -0.8690183
# 6:          6 -0.4617027
# 7:          9 -0.1503822
# 8:          9 -0.6281268
# 9:          9  1.3232209

答案 1 :(得分:1)

一个简单的替代方法是使用dplyr

library(dplyr)
# Example data: 10 rows, `a` drawn from 1:2 and `b` from 1:5, both with
# replacement. No seed is set, so the values differ from run to run.
# `replace = TRUE` is spelled out: `rep` relied on partial argument
# matching and `T` is an ordinary variable that can be reassigned.
dfr <- data.frame(
  a = sample(1:2, 10, replace = TRUE),
  b = sample(1:5, 10, replace = TRUE)
)
# Group the rows by the value of column b; the grouping is kept on dfr.
dfr <- dfr %>% group_by(b)
dfr
# Source: local data frame [10 x 2]
# Groups: b
# 
#    a b
# 1  2 4
# 2  2 2
# 3  2 5
# 4  2 1
# 5  1 2
# 6  1 3
# 7  2 1
# 8  2 4
# 9  1 4
# 10 2 4
# n() is the size of the current group, so only rows belonging to a group
# with more than one row are kept.
dfr %>% filter(n() > 1)
# Source: local data frame [8 x 2]
# Groups: b
# 
#   a b
# 1 2 4
# 2 2 2
# 3 2 1
# 4 1 2
# 5 2 1
# 6 2 4
# 7 1 4
# 8 2 4

答案 2 :(得分:0)

我必须稍微编辑你的数据框:

# Reproducible example data: 15 records over 10 subjects.
set.seed(20)
subject_id <- as.factor(c(1, 2, 3, 4, 4, 5, 5, 6, 6, 7, 8, 9, 9, 9, 10))
variable <- rnorm(15)
# Build the data frame directly: cbind() would first form a numeric matrix
# and silently replace the factor with its internal integer codes.
mydata <- data.frame(subject_id, variable)

现在获取出现多次的主题的所有行:

# A row is kept when its subject_id repeats an earlier row OR a later row
# (fromLast = TRUE scans from the end), which covers every occurrence of
# each subject that appears more than once.
mydata[
  with(
    mydata,
    duplicated(subject_id) | duplicated(subject_id, fromLast = TRUE)
  ),
]
#    subject_id   variable
# 4           4 -1.3325937
# 5           4 -0.4465668
# 6           5  0.5696061
# 7           5 -2.8897176
# 8           6 -0.8690183
# 9           6 -0.4617027
# 12          9 -0.1503822
# 13          9 -0.6281268
# 14          9  1.3232209

编辑:这也可以使用您的duplicates向量:

# Keep the rows whose subject_id matches any entry of `duplicates`
# (match() with nomatch = 0L, compared to 0, is a membership test).
mydata[match(mydata$subject_id, duplicates, nomatch = 0L) > 0L, ]

答案 3 :(得分:0)

Here you go (I changed your variable to var <- rnorm(15)):


# Reproducible example data: 15 records over 10 subjects.
set.seed(11)

subject_id <- as.factor(c(1, 2, 3, 4, 4, 5, 5, 6, 6, 7, 8, 9, 9, 9, 10))
var <- rnorm(15)
# Build the data frame directly: cbind() would first form a numeric matrix
# and silently replace the factor with its internal integer codes.
mydata <- data.frame(subject_id, var)

# x1: subject ids (as character) that occur more than once, taken from the
# names of the frequency-table entries whose count exceeds 1.
x1 <- names(which(table(mydata$subject_id) > 1))
# x2: positions of the rows belonging to those subjects.
x2 <- which(mydata$subject_id %in% x1)
mydata[x2, ]

     subject_id   var
4           4  0.3951076
5           4 -2.4129058
6           5 -1.3309979
7           5 -1.7354382
8           6  0.4020871
9           6  0.4628287
12          9 -2.1744466
13          9  0.4857337
14          9  1.0245632

答案 4 :(得分:0)

尝试:

> mydata[mydata$subject_id %in% mydata[duplicated(mydata$subject_id),]$subject_id,]
   subject_id   variable
4           4 -1.3325937
5           4 -0.4465668
6           5  0.5696061
7           5 -2.8897176
8           6 -0.8690183
9           6 -0.4617027
12          9 -0.1503822
13          9 -0.6281268
14          9  1.3232209