我为我的数据做了一个小例子:
# Sample data: ten hourly readings in month 1, five on day 10 and five on day 11.
mth <- rep(1, 10)
day <- rep(c(10, 11), each = 5)
hr <- rep(c(3, 4, 5, 6, 7), times = 2)
v <- rep(c(3, 4, 5, 4, 3), times = 2)
# Pass the vectors directly; data.frame() names the columns after them.
A <- data.frame(mth, day, hr, v)
我需要做的是统计每天有多少个值 < 4。我尝试使用函数rle
,但无法得到我需要的东西。输出应如下所示:
# Expected result: one row per (mth, day) pair.
mth <- rep(1, 2)
day <- c(10, 11)
# Each 2 counts the two readings below 4 (the two 3s) on day 10 and on day 11.
v <- rep(2, 2)
A <- data.frame(mth, day, v)
感谢您的帮助!
答案 0 :(得分:3)
以下是使用tapply
的基础R解决方案:
# Console transcript: paste() builds one "mth_day" key per row of A, and
# tapply() applies the anonymous function to v within each key, counting the
# values below 4. The two lines after the call are the printed result.
> with(A, tapply(v, paste(mth,day, sep="_"), function(x) sum(x<4) ) )
1_10 1_11
2 2
(并且它将比plyr解决方案快得多,但很可能排在data.table方法的第二位。)
答案 1 :(得分:2)
使用ddply很容易
library(plyr)
# Split A by (mth, day) and, for each piece, count the values of v below 4.
# An unnamed scalar result ends up in a column called V1.
ddply(
  A,
  .(mth, day),
  function(piece) sum(piece$v < 4)
)
# mth day V1
# 1 1 10 2
# 2 1 11 2
或者您可以使用summarize
# summarize evaluates the expression inside each (mth, day) group and stores
# the result under the given column name (less4).
ddply(
  A,
  .(mth, day),
  .fun = summarize,
  less4 = sum(v < 4)
)
# mth day less4
# 1 1 10 2
# 2 1 11 2
答案 2 :(得分:2)
data.table
解决方案
library(data.table)
A <- data.table(A)
# Grouped aggregate: per (mth, day), total the logical vector v < 4.
A[, sum(v < 4), by = .(mth, day)]
## mth day V1
## 1: 1 10 2
## 2: 1 11 2
# or: keep only the rows with v < 4, then count the rows left in each group.
# A group with no value below 4 is absent from this result instead of
# appearing with a count of 0.
A[v < 4, .N, by = .(mth, day)]
## mth day N
## 1: 1 10 2
## 2: 1 11 2
# Build a mock dataset covering one full year of hourly readings.
library(rbenchmark)
# The benchmarked functions call data.table, ddply() and sqldf(); load all
# three packages here so the benchmark does not depend on earlier snippets.
library(data.table)
library(plyr)
library(sqldf)
# 2000 is a leap year, so span the calendar year explicitly (366 days) rather
# than hard-coding 365 days, which left out 31 December.
daily <- seq(as.Date("2000-01-01"), as.Date("2000-12-31"), by = "day")
# month() and mday() (data.table) give the month number and the day of month.
A <- data.table(mth = month(daily), day = mday(daily))
# Expand every (mth, day) pair into 24 hourly rows.
A <- A[, list(hr = 1:24), by = list(mth, day)]
# Random integer readings in 1..10. TRUE is spelled out because T is an
# ordinary variable that can be reassigned, not a reserved word.
A[["v"]] <- sample(1:10, nrow(A), replace = TRUE)
# set up the various options
# plyr variant 1: split the global A by (mth, day) and count, per piece, the
# values of v below 4 with an anonymous function.
ddply1 <- function() {
  ddply(A, .(mth, day), function(piece) sum(piece$v < 4))
}
# plyr variant 2: same grouping on the global A, with summarize computing the
# count into a column named less4.
ddply2 <- function() {
  ddply(A, .(mth, day), .fun = summarize, less4 = sum(v < 4))
}
# Base R variant: build one "mth_day" key per row of the global A and count,
# within each key, the values of v below 4.
base_tapply <- function() {
  with(A, {
    day_key <- paste(mth, day, sep = "_")
    tapply(v, day_key, function(vals) sum(vals < 4))
  })
}
# data.table variant 1: grouped aggregate on the global A, totalling the
# logical vector v < 4 within each (mth, day) group.
dt1 <- function() {
  A[, sum(v < 4), by = .(mth, day)]
}
# data.table variant 2: filter the global A to rows with v < 4 first, then
# count the remaining rows (.N) in each (mth, day) group.
dt2 <- function() {
  A[v < 4, .N, by = .(mth, day)]
}
# SQL variant: count, per (mth, day) group of the global A, the rows with
# v < 4. The grouping must include mth as well as day: grouping by day alone
# merges the same day-of-month across all months, so this variant would do
# different (and less) work than the other benchmarked functions and would
# report an arbitrary mth for each group.
sqldf_ <- function() {
  sqldf("SELECT A.mth,A.day,sum(A.v<4) as sum FROM A GROUP BY A.mth,A.day")
}
# Time every approach over 5 replications and report the selected columns.
benchmark(
  ddply1(),
  ddply2(),
  base_tapply(),
  dt1(),
  dt2(),
  sqldf_(),
  replications = 5,
  columns = c("test", "replications", "elapsed", "relative", "user.self")
)
## test replications elapsed relative user.self
## 3 base_tapply() 5 0.08 8 0.08
## 1 ddply1() 5 0.72 72 0.72
## 2 ddply2() 5 1.04 104 1.03
## 4 dt1() 5 0.01 1 0.02
## 5 dt2() 5 0.00 0 0.00
## 6 sqldf_() 5 0.21 21 0.20
答案 3 :(得分:1)
使用sqldf
library(sqldf)
# Count the rows with v < 4 per (mth, day). Group by both columns: grouping by
# day alone would merge the same day-of-month across different months and
# return an arbitrary mth for each group.
sqldf("SELECT A.mth,A.day,sum(A.v<4) as sum FROM A GROUP BY A.mth,A.day")
# mth day sum
#1 1 10 2
#2 1 11 2