R:按分组标准从数据框中删除行

时间:2014-09-04 21:46:16

标签: r subset dplyr

我有一个这样的数据框:

  period     x   y db     perc
1 2013-08-26 4 166 nh 2.409639
2 2013-09-02 5 222 nh 2.252252
3 2013-09-09 3 223 nh 1.345291
4 2013-09-16 9 198 nh 4.545455
5 2013-09-23 3 213 nh 1.408451
6 2013-09-30 5 226 nh 2.212389
...

每个 db 都有很多条观测记录。我希望创建该数据框的一个子集,排除那些所属 db 的 x 值之和等于 0 的行;为此我导出了一个逻辑向量 removal_candidates,如下所示:

# Sum the x column separately for each value of db.
xsums <- by(drawdata$x, drawdata$db, sum)
# Logical flag per db: TRUE where that db's x values sum to exactly zero.
removal_candidates <- xsums == 0
...

   ne    nf    nh    ni    nj    nl    nm    nn    no    np    nq    nr    nu 
FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

在这个示例中,我想制作一份没有 nf 行的drawdata副本,但drawdata[drawdata$db == removal_candidates]之类的天真尝试不起作用。

-

编辑:补充一个可行的解决方案(参考了下面的回答):

# Sum the x column separately for each value of db.
xsums <- by(drawdata$x, drawdata$db, sum)
# Keep every db whose total is non-zero. Testing `!= 0` instead of `> 0`
# drops only the zero-sum groups, so a db with a negative total is retained.
keep_candidates <- xsums != 0
# Names of the db groups to keep (the logical flags index their own names).
inc <- names(keep_candidates)[keep_candidates]
drawdata <- drawdata[drawdata$db %in% inc, ]
drawdata$db <- factor(drawdata$db) # needed to remove 'nf' from 'levels(drawdata$db)'

编辑:根据下面 @jazzurro 的回答得到的可行解决方案:

library(dplyr)
# One row per db holding the total of x for that db.
xs <- summarise(group_by(drawdata, db), xsum = sum(x))
# db values whose total is exactly zero.
removals <- xs$db[xs$xsum == 0]
# Drop every row belonging to one of those db values.
drawdata <- drawdata %>% filter(!(db %in% removals))
# Rebuild the factor so the removed db values no longer appear as levels.
drawdata$db <- factor(drawdata$db)

对我这个新手来说,xs 是一个数据框,这让 @jazzurro 的方法非常容易上手。

-

更仔细地阅读 dplyr introduction 并参考 dplyr window functions 文档之后,又得到了几个版本:

library(dplyr)
# Nested form: group by db, attach each group's total of x as column sx,
# then keep only the rows whose group total is non-zero.
filter(
    mutate(
        group_by(drawdata, db),
        sx = sum(x)
    ),
    sx != 0  # `!= 0` (not `> 0`) so groups with a negative total are kept
)

或者像这样使用“链式”写法:

group_by(drawdata, db) %>%  # make a grouping (not visible)
  mutate(sx = sum(x)) %>%   # add a column based on the grouping
  filter(sx != 0)           # filter by new column; `!= 0` keeps negative totals

这很棒。

2 个答案:

答案 0 :(得分:1)

所以你的意思是:如果某个 db(例如 nf)的 x 之和为 0,就删除该 db 的所有行,对吗?如果是这样,下面是我的建议。

# Thirty consecutive days starting 2013-01-01.
period <- seq(as.Date("2013/1/1"), by = "day", length.out = 30)
# The first two runs of ten values each cancel out to zero; the last run is 5..14.
x <- c(1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 5:14)
# Three db groups of ten rows each, aligned with the three runs in x.
db <- rep(c("nh", "nf", "nl"), each = 10)
# Random filler column; values differ on every run because no seed is set.
perc <- runif(30, 0, 5)

# Spell out FALSE: the shorthand F is an ordinary variable that can be reassigned.
foo <- data.frame(period, x, db, perc, stringsAsFactors = FALSE)

library(dplyr)

# One row per db with the total of x for that db in column `whatever`.
ana <- summarize(group_by(foo, db), whatever = sum(x))

  db whatever
1 nf        0
2 nh        0
3 nl       95

所以,你想从foo中删除nf和nh。

# db values whose total of x is exactly zero.
bob <- with(ana, db[whatever == 0])

> bob
[1] "nf" "nh"

# Keep only the rows whose db is not among the zero-sum groups.
cathy <- foo %>% filter(!(db %in% bob))

> cathy
   period  x db      perc
1  2013-01-21  5 nl 3.6306351
2  2013-01-22  6 nl 4.9999196
3  2013-01-23  7 nl 3.1791477
4  2013-01-24  8 nl 1.1021805
5  2013-01-25  9 nl 0.3998116
6  2013-01-26 10 nl 0.3279883
7  2013-01-27 11 nl 3.2215079
8  2013-01-28 12 nl 3.0357360
9  2013-01-29 13 nl 2.1077811
10 2013-01-30 14 nl 3.2024951

答案 1 :(得分:0)

尝试以下代码(为清晰起见,修改了示例数据):

# Example frame: ten rows spread over six db levels, with period and db
# stored as factors.
drawdata <- data.frame(
  period = factor(c(
    "2013-08-26", "2013-09-02", "2013-09-09", "2013-09-16", "2013-09-23",
    "2013-09-30", "2013-09-09", "2013-09-16", "2013-09-23", "2013-09-30"
  )),
  x = c(4L, 5L, 3L, 9L, 3L, 5L, 3L, 9L, 3L, 5L),
  y = c(166L, 222L, 223L, 198L, 213L, 226L, 223L, 198L, 213L, 226L),
  db = factor(c("nh", "nh", "nf", "nf", "nf", "ne", "nf", "ni", "nj", "nl")),
  perc = c(
    2.409639, 2.252252, 1.345291, 4.545455, 1.408451,
    2.212389, 1.345291, 4.545455, 1.408451, 2.212389
  )
)

# Named logical vector: TRUE marks a db whose rows should be removed.
removal_candidates <- c(
  ne = FALSE, nf = TRUE, nh = FALSE, ni = FALSE, nj = FALSE,
  jl = FALSE, nm = FALSE, nn = FALSE, no = FALSE
)

removal_candidates
   ne    nf    nh    ni    nj    jl    nm    nn    no 
FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 

drawdata
       period x   y db     perc
1  2013-08-26 4 166 nh 2.409639
2  2013-09-02 5 222 nh 2.252252
3  2013-09-09 3 223 nf 1.345291
4  2013-09-16 9 198 nf 4.545455
5  2013-09-23 3 213 nf 1.408451
6  2013-09-30 5 226 ne 2.212389
7  2013-09-09 3 223 nf 1.345291
8  2013-09-16 9 198 ni 4.545455
9  2013-09-23 3 213 nj 1.408451
10 2013-09-30 5 226 nl 2.212389

# Turn the named logical vector into a data frame and expose its names
# (which become the row names) as an ordinary column.
rc <- data.frame(removal_candidates)
rc$id <- rownames(rc)
# Keep the rows whose db is not one of the ids flagged TRUE.
drawdata[!(drawdata$db %in% rc$id[rc$removal_candidates]), ]
       period x   y db     perc
1  2013-08-26 4 166 nh 2.409639
2  2013-09-02 5 222 nh 2.252252
6  2013-09-30 5 226 ne 2.212389
8  2013-09-16 9 198 ni 4.545455
9  2013-09-23 3 213 nj 1.408451
10 2013-09-30 5 226 nl 2.212389