根据条件保留data.frames列中的行

时间:2014-12-18 09:54:12

标签: r list dataframe apply

我有data.frame数据如下。

# Sample data: one row per individual (ind), with an identifier (id),
# a group label (gp) and a name (nm).
ind <- c("22", "58", "57", "43", "23", "90", "45", "21", "27", "67",
         "17", "50", "71", "49", "92", "36", "62", "83", "02", "95")

gp <- c("G1", "G1", "G1", "G23", "G23", "G28", "G28", "G29", "G53",
        "G56", "G56", "G67", "G77", "G77", "G79", "G79", "G79",
        "G82", "G82", "G82")
id <- c("T297170", "T304934", "T437551", "T572358", "T572359", "T118839",
        "T304962", "T594651", "T113085", "T304969", "T444487", "T296315",
        "T305008", "T437400", "T113089", "T305032", "T557004", "T445004",
        "T445004", "T78642")
nm <- c("MaskedMarvel", "DecemberSnowflakes", "MaskedMarvel",
        "WarmPuppy", "WarmPuppy", "SpringDance", "SpringDance",
        "RoastedMarshmallows", "TrickorTreat", "FrisbeeSailing",
        "FrisbeeSailing", "GreatPumpkin", "PumpkinHelmet", "PumpkinHelmet",
        "GoodSport", "GoodSport", "GoodSport", "GiftGettingSeason",
        "EasterBeagle", "EasterBeagle")

# Request factor columns explicitly: since R 4.0 data.frame() keeps strings
# as character by default, but the code further down calls levels() and
# droplevels() on these columns.
data <- data.frame(ind, id, gp, nm, stringsAsFactors = TRUE)
data
   ind      id  gp                  nm
1   22 T297170  G1        MaskedMarvel
2   58 T304934  G1  DecemberSnowflakes
3   57 T437551  G1        MaskedMarvel
4   43 T572358 G23           WarmPuppy
5   23 T572359 G23           WarmPuppy
6   90 T118839 G28         SpringDance
7   45 T304962 G28         SpringDance
8   21 T594651 G29 RoastedMarshmallows
9   27 T113085 G53        TrickorTreat
10  67 T304969 G56      FrisbeeSailing
11  17 T444487 G56      FrisbeeSailing
12  50 T296315 G67        GreatPumpkin
13  71 T305008 G77       PumpkinHelmet
14  49 T437400 G77       PumpkinHelmet
15  92 T113089 G79           GoodSport
16  36 T305032 G79           GoodSport
17  62 T557004 G79           GoodSport
18  83 T445004 G82   GiftGettingSeason
19  02 T445004 G82        EasterBeagle
20  95  T78642 G82        EasterBeagle

我想只保留这样一些组(由gp指定)的行:该组在列nm中含有不同的元素。在nm中只含有单一元素的组不应保留。

我使用下面的代码获得了所需的结果。

# Split the data.frame into a list of per-group data.frames. split() works
# whether gp is a factor or a character vector, whereas levels(data$gp) is
# NULL for character columns (the data.frame() default since R 4.0).
groups <- split(data, data$gp, drop = TRUE)
# Keep only the groups whose nm column holds more than one distinct value.
# This also discards single-row groups, so no separate nrow() check is needed.
has_distinct_nm <- vapply(
  groups,
  function(g) length(unique(g$nm)) > 1,
  logical(1)
)
# unname() keeps the original row names instead of prefixing them with the
# group label when the pieces are bound back together.
kept <- unname(groups[has_distinct_nm])
# rbind the surviving data.frames; fall back to a zero-row data.frame when no
# group qualifies, because do.call(rbind, list()) would return NULL.
data <- if (length(kept) > 0) {
  do.call(rbind, kept)
} else {
  data[0, , drop = FALSE]
}
# Drop factor levels that no longer occur.
data <- droplevels(data)
data
   ind      id  gp                 nm    rm
1   22 T297170  G1       MaskedMarvel  TRUE
2   58 T304934  G1 DecemberSnowflakes FALSE
3   57 T437551  G1       MaskedMarvel  TRUE
18  83 T445004 G82  GiftGettingSeason FALSE
19  02 T445004 G82       EasterBeagle  TRUE
20  95  T78642 G82       EasterBeagle  TRUE

是否有一种更优雅的方式来实现这一结果,可能只需要在基础R的一个步骤中完成?

1 个答案:

答案 0 :(得分:4)

你可以尝试

# Number of distinct nm values within each gp group, repeated for every row.
# ave() is fed integer codes so the counts stay numeric; calling it on
# as.character(nm) returns the counts as strings and turns `> 1` into a
# string comparison.
n_distinct_nm <- with(
  data,
  ave(as.integer(factor(nm)), gp, FUN = function(x) length(unique(x)))
)
# Keep only the rows whose group has more than one distinct nm value.
data1 <- data[n_distinct_nm > 1, ]
# Flag rows whose nm value occurs more than once within their group. ave()
# stores the logical result as 0/1 in its integer input, so it is converted
# back to give a logical (not character) rm column.
transform(
  data1,
  rm = as.logical(ave(
    as.integer(factor(nm)), gp,
    FUN = function(x) duplicated(x) | duplicated(x, fromLast = TRUE)
  ))
)
#   ind      id  gp                 nm    rm
#1   22 T297170  G1       MaskedMarvel  TRUE
#2   58 T304934  G1 DecemberSnowflakes FALSE
#3   57 T437551  G1       MaskedMarvel  TRUE
#18  83 T445004 G82  GiftGettingSeason FALSE
#19  02 T445004 G82       EasterBeagle  TRUE
#20  95  T78642 G82       EasterBeagle  TRUE

或使用data.table

 library(data.table)
 # setDT() converts data to a data.table in place. The first step keeps whole
 # groups that have more than one distinct nm value (groups for which the
 # expression yields NULL are dropped); the second flags rows whose nm value
 # occurs more than once within their group. The trailing [] prints the result.
 setDT(data)[, if (uniqueN(nm) > 1L) .SD, by = gp][
         , rm := duplicated(nm) | duplicated(nm, fromLast = TRUE), by = gp][]
 #    gp ind      id                 nm    rm
 #1:  G1  22 T297170       MaskedMarvel  TRUE
 #2:  G1  58 T304934 DecemberSnowflakes FALSE
 #3:  G1  57 T437551       MaskedMarvel  TRUE
 #4: G82  83 T445004  GiftGettingSeason FALSE
 #5: G82  02 T445004       EasterBeagle  TRUE
 #6: G82  95  T78642       EasterBeagle  TRUE

或使用dplyr

 library(dplyr)
 data %>%
       group_by(gp) %>%
       # Keep only the groups with more than one distinct nm value.
       filter(n_distinct(nm) > 1) %>%
       # Flag rows whose nm value occurs more than once within their group.
       mutate(rm = duplicated(nm) | duplicated(nm, fromLast = TRUE)) %>%
       # Drop the grouping so later verbs do not silently operate per group.
       ungroup()