我有一个 data.frame,数据如下。
# Example data: one row per individual (ind) with its id, group (gp) and
# name (nm).
ind <- c("22", "58", "57", "43", "23", "90", "45", "21", "27", "67",
         "17", "50", "71", "49", "92", "36", "62", "83", "02", "95")
gp <- c("G1", "G1", "G1", "G23", "G23", "G28", "G28", "G29", "G53",
        "G56", "G56", "G67", "G77", "G77", "G79", "G79", "G79",
        "G82", "G82", "G82")
id <- c("T297170", "T304934", "T437551", "T572358", "T572359", "T118839",
        "T304962", "T594651", "T113085", "T304969", "T444487", "T296315",
        "T305008", "T437400", "T113089", "T305032", "T557004", "T445004",
        "T445004", "T78642")
nm <- c("MaskedMarvel", "DecemberSnowflakes", "MaskedMarvel",
        "WarmPuppy", "WarmPuppy", "SpringDance", "SpringDance",
        "RoastedMarshmallows", "TrickorTreat", "FrisbeeSailing",
        "FrisbeeSailing", "GreatPumpkin", "PumpkinHelmet", "PumpkinHelmet",
        "GoodSport", "GoodSport", "GoodSport", "GiftGettingSeason",
        "EasterBeagle", "EasterBeagle")
# The columns are meant to be factors (the example treats them as such, e.g.
# via droplevels()). Since R 4.0.0 data.frame() no longer converts strings to
# factors by default, so the conversion is requested explicitly.
data <- data.frame(ind, id, gp, nm, stringsAsFactors = TRUE)
data
ind id gp nm
1 22 T297170 G1 MaskedMarvel
2 58 T304934 G1 DecemberSnowflakes
3 57 T437551 G1 MaskedMarvel
4 43 T572358 G23 WarmPuppy
5 23 T572359 G23 WarmPuppy
6 90 T118839 G28 SpringDance
7 45 T304962 G28 SpringDance
8 21 T594651 G29 RoastedMarshmallows
9 27 T113085 G53 TrickorTreat
10 67 T304969 G56 FrisbeeSailing
11 17 T444487 G56 FrisbeeSailing
12 50 T296315 G67 GreatPumpkin
13 71 T305008 G77 PumpkinHelmet
14 49 T437400 G77 PumpkinHelmet
15 92 T113089 G79 GoodSport
16 36 T305032 G79 GoodSport
17 62 T557004 G79 GoodSport
18 83 T445004 G82 GiftGettingSeason
19 02 T445004 G82 EasterBeagle
20 95 T78642 G82 EasterBeagle
我想只保留这样的组(由 gp 列指定)中的行:该组在 nm 列中含有不同的元素。
在 nm 列中只含单一元素的组不应保留。
我使用下面的代码获得了所需的结果。
# Split the data.frame into a list of data.frames, one per group in gp.
# split() works for both factor and character gp, whereas levels() returns
# NULL for a character column and would yield an empty list. The list is
# unnamed so that rbind() below keeps the original row names.
groups <- unname(split(data, data$gp, drop = TRUE))
# Keep only groups with more than one distinct value in nm. This drops
# single-row groups as well as groups that repeat a single nm value.
keep <- vapply(
  groups,
  function(g) length(unique(g$nm)) > 1L,
  logical(1)
)
# Row-bind the remaining per-group data.frames back together
data <- do.call(rbind, groups[keep])
# Drop factor levels that no longer occur (no-op for character columns)
data <- droplevels(data)
data
ind id gp nm rm
1 22 T297170 G1 MaskedMarvel TRUE
2 58 T304934 G1 DecemberSnowflakes FALSE
3 57 T437551 G1 MaskedMarvel TRUE
18 83 T445004 G82 GiftGettingSeason FALSE
19 02 T445004 G82 EasterBeagle TRUE
20 95 T78642 G82 EasterBeagle TRUE
是否有更优雅的方式来得到这个结果,最好只用基础 R 一步完成?
答案 0 :(得分:4)
你可以尝试
# Number of distinct nm values within each gp group, repeated for every row.
# ave() returns a vector of the same type as its first argument, so integer
# codes are passed instead of strings; passing as.character(nm) would turn
# the counts into strings and make `> 1` a string comparison.
n_nm <- with(data, ave(as.integer(factor(nm)), gp,
                       FUN = function(x) length(unique(x))))
# Keep only rows belonging to groups with more than one distinct nm
data1 <- data[n_nm > 1, ]
# Flag rows whose nm occurs more than once within its group. ave() yields
# 0/1 integers here; as.logical() makes rm a genuine logical column rather
# than the strings "TRUE"/"FALSE".
transform(data1, rm = as.logical(ave(
  as.integer(factor(nm)), gp,
  FUN = function(x) duplicated(x) | duplicated(x, fromLast = TRUE)
)))
# ind id gp nm rm
#1 22 T297170 G1 MaskedMarvel TRUE
#2 58 T304934 G1 DecemberSnowflakes FALSE
#3 57 T437551 G1 MaskedMarvel TRUE
#18 83 T445004 G82 GiftGettingSeason FALSE
#19 02 T445004 G82 EasterBeagle TRUE
#20 95 T78642 G82 EasterBeagle TRUE
或使用data.table
library(data.table)
# setDT() converts data to a data.table by reference (data itself changes).
# Step 1: per gp group, return the group's rows only when nm has more than
#         one distinct value; groups yielding NULL are dropped.
# Step 2: per gp group, flag rows whose nm occurs more than once.
# The trailing [] prints the result of the := assignment.
setDT(data)[, if (uniqueN(nm) > 1L) .SD, by = gp][
  , rm := duplicated(nm) | duplicated(nm, fromLast = TRUE), by = gp][]
# gp ind id nm rm
#1: G1 22 T297170 MaskedMarvel TRUE
#2: G1 58 T304934 DecemberSnowflakes FALSE
#3: G1 57 T437551 MaskedMarvel TRUE
#4: G82 83 T445004 GiftGettingSeason FALSE
#5: G82 02 T445004 EasterBeagle TRUE
#6: G82 95 T78642 EasterBeagle TRUE
或使用dplyr
library(dplyr)
data %>%
  group_by(gp) %>%
  # keep groups with more than one distinct nm value
  filter(n_distinct(nm) > 1) %>%
  # flag rows whose nm occurs more than once within its group
  mutate(rm = duplicated(nm) | duplicated(nm, fromLast = TRUE)) %>%
  # drop the grouping so later operations are not silently applied per group
  ungroup()