我有一个这样的数据框:
period x y db perc
1 2013-08-26 4 166 nh 2.409639
2 2013-09-02 5 222 nh 2.252252
3 2013-09-09 3 223 nh 1.345291
4 2013-09-16 9 198 nh 4.545455
5 2013-09-23 3 213 nh 1.408451
6 2013-09-30 5 226 nh 2.212389
...
每个 db 都有很多条观测记录。我希望得到该数据框的一个子集,把所属 db 的 x 值之和等于 0 的那些行排除掉。为此我已经得到了一个逻辑向量 removal_candidates
,如下所示:
# Total of x within each db, then flag the db groups whose total is exactly 0.
xsums <- by(data = drawdata$x, INDICES = drawdata$db, FUN = sum)
removal_candidates <- xsums == 0
...
ne nf nh ni nj nl nm nn no np nq nr nu
FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
在这个示例中,我想制作一份没有 nf 行的drawdata副本,但drawdata[drawdata$db == removal_candidates]
之类的天真尝试不起作用。
-
编辑:根据下面的回答,给出一个可行的解决方案:
# Total of x per db; a db is kept only when its total is strictly positive.
xsums <- by(drawdata$x, drawdata$db, sum)
keep_candidates <- xsums > 0
# Labels of the db groups to retain.
inc <- names(keep_candidates)[keep_candidates]
# Filter the rows, then rebuild the factor so that the dropped db (e.g. 'nf')
# also disappears from levels(drawdata$db).
keep_rows <- drawdata$db %in% inc
drawdata <- drawdata[keep_rows, ]
drawdata$db <- factor(drawdata$db)
编辑:根据下面 @jazzurro 的回答得到的可行解决方案:
library(dplyr)
# One row per db holding the total of x for that db.
xs <- summarise(group_by(drawdata, db), xsum = sum(x))
# db labels whose total is exactly zero.
removals <- xs$db[xs$xsum == 0]
# Drop the rows of those db groups, then rebuild the factor so the unused
# levels are removed as well.
drawdata <- drawdata %>% filter(!(db %in% removals))
drawdata$db <- factor(drawdata$db)
对我这样的新手来说,xs 本身就是一个数据框,这使得 @jazzurro 的方法非常容易上手。
-
在更仔细地阅读 dplyr introduction 并参考 dplyr window functions 文档之后,又得到了下面几个版本:
library(dplyr)
# Nested form: group by db, add the per-db total of x as column sx
# (a grouped mutate repeats the group total on every row), then keep
# only the rows whose group total is positive.
filter(
  mutate(
    group_by(drawdata, db),
    sx = sum(x)
  ),
  sx > 0
)
或者像这样使用"链式"(chaining)写法:
# Chained form of the same computation; each step feeds the next via %>%.
group_by(drawdata, db) %>%   # make a grouping (not visible)
  mutate(sx = sum(x)) %>%    # add a column based on the grouping
  filter(sx > 0)             # filter by new column
这很棒。
答案 0 :(得分:1)
你的意思是:如果某个 db(例如 nf)的 x 之和为 0,就要删除属于该 db 的所有行,对吗?如果是这样,下面是我的建议。
# Example data: 30 daily observations split evenly across three db labels.
# The x values of "nh" and "nf" each sum to 0, while "nl" sums to 95.
period <- seq(as.Date("2013/1/1"), by = "day", length.out = 30)
x <- c(1, 2, 3, 4, 5, -5, -4, -3, -2, -1,
       1, 2, 3, 4, 5, -5, -4, -3, -2, -1,
       5:14)
db <- rep(c("nh", "nf", "nl"), each = 10)
# perc is random filler; its values differ from run to run (no seed is set).
perc <- runif(30, 0, 5)
# Spell out FALSE: the shorthand F is an ordinary variable that can be
# reassigned, so it is not a safe substitute for the constant.
foo <- data.frame(period, x, db, perc, stringsAsFactors = FALSE)
library(dplyr)
# One row per db with the total of x in column `whatever`.
ana <- summarize(group_by(foo, db), whatever = sum(x))
db whatever
1 nf 0
2 nh 0
3 nl 95
所以,你想从foo中删除nf和nh。
# db labels whose total of x is exactly zero; these groups are to be dropped.
zero_total <- ana$whatever == 0
bob <- ana$db[zero_total]
> bob
[1] "nf" "nh"
# Keep only the rows whose db is not among the zero-total labels.
cathy <- foo %>% filter(!(db %in% bob))
> cathy
period x db perc
1 2013-01-21 5 nl 3.6306351
2 2013-01-22 6 nl 4.9999196
3 2013-01-23 7 nl 3.1791477
4 2013-01-24 8 nl 1.1021805
5 2013-01-25 9 nl 0.3998116
6 2013-01-26 10 nl 0.3279883
7 2013-01-27 11 nl 3.2215079
8 2013-01-28 12 nl 3.0357360
9 2013-01-29 13 nl 2.1077811
10 2013-01-30 14 nl 3.2024951
答案 1 :(得分:0)
尝试以下代码(为清晰起见,修改了示例数据):
# Example data: ten observations over six db labels. period and db are
# factors with explicitly ordered levels; x and y are integers.
drawdata <- data.frame(
  period = factor(
    c("2013-08-26", "2013-09-02", "2013-09-09", "2013-09-16", "2013-09-23",
      "2013-09-30", "2013-09-09", "2013-09-16", "2013-09-23", "2013-09-30"),
    levels = c("2013-08-26", "2013-09-02", "2013-09-09",
               "2013-09-16", "2013-09-23", "2013-09-30")
  ),
  x = c(4L, 5L, 3L, 9L, 3L, 5L, 3L, 9L, 3L, 5L),
  y = c(166L, 222L, 223L, 198L, 213L, 226L, 223L, 198L, 213L, 226L),
  db = factor(
    c("nh", "nh", "nf", "nf", "nf", "ne", "nf", "ni", "nj", "nl"),
    levels = c("ne", "nf", "nh", "ni", "nj", "nl")
  ),
  perc = c(2.409639, 2.252252, 1.345291, 4.545455, 1.408451,
           2.212389, 1.345291, 4.545455, 1.408451, 2.212389)
)
# Named logical vector: TRUE marks a db label whose rows should be removed.
# NOTE(review): the name "jl" may be a typo for "nl"; it is kept unchanged
# because the printed output shows it.
removal_candidates <- c(
  ne = FALSE, nf = TRUE, nh = FALSE, ni = FALSE, nj = FALSE,
  jl = FALSE, nm = FALSE, nn = FALSE, no = FALSE
)
removal_candidates
ne nf nh ni nj jl nm nn no
FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
drawdata
period x y db perc
1 2013-08-26 4 166 nh 2.409639
2 2013-09-02 5 222 nh 2.252252
3 2013-09-09 3 223 nf 1.345291
4 2013-09-16 9 198 nf 4.545455
5 2013-09-23 3 213 nf 1.408451
6 2013-09-30 5 226 ne 2.212389
7 2013-09-09 3 223 nf 1.345291
8 2013-09-16 9 198 ni 4.545455
9 2013-09-23 3 213 nj 1.408451
10 2013-09-30 5 226 nl 2.212389
# Turn the named logical vector into a lookup table: one row per db label,
# with the label itself stored in column id.
rc <- data.frame(removal_candidates)
rc$id <- rownames(rc)
# Labels flagged TRUE are the db groups to drop; show every other row.
flagged_ids <- rc[rc$removal_candidates, ]$id
drawdata[!(drawdata$db %in% flagged_ids), ]
period x y db perc
1 2013-08-26 4 166 nh 2.409639
2 2013-09-02 5 222 nh 2.252252
6 2013-09-30 5 226 ne 2.212389
8 2013-09-16 9 198 ni 4.545455
9 2013-09-23 3 213 nj 1.408451
10 2013-09-30 5 226 nl 2.212389