我想使用 R 的 dplyr 包,在不使用循环的情况下完成以下与区间相关的计算:统计落入每个区间的观测值个数,并计算每个区间内观测值之和。
区间端点来自列 df_abs$interval 和 df_rel$interval。
包含数据和区间的数据框如下所示:
library(dplyr)
# ----------{ data and interval ----------
# Sample observations; only varC carries values, varA and varB are NA
# placeholders.
df_data <- data.frame(
  varA = NA,
  varB = NA,
  varC = c(
    -81.0, -14.3, 29.6, 42.7, 46.4, 57.7,
    15.3, 256.3, 20.3, -25.1, -23.1, -17.5
  )
)

# Endpoints of the absolute intervals (compared against varC); count and sum
# start as NA and are filled in by the calculation step.
df_abs <- data.frame(
  interval = c(-Inf, -60, -30, 0, 30, 60, 100, 200, Inf),
  count = NA,
  sum = NA
)

# Endpoints of the relative intervals (compared against observationPercent);
# count and sum start as NA and are filled in by the calculation step.
df_rel <- data.frame(
  interval = c(0, 5, 15, 50, 75, 95, 100),
  count = NA,
  sum = NA
)
# ---------- data and interval }----------
# ----------{ calculation ----------
# absolute data frame
# Row k of df_abs describes the half-open interval
# (interval[k - 1], interval[k]]. Row 1 has no lower endpoint, so its count
# and sum stay at 0.
df_abs$count <- numeric(nrow(df_abs))
df_abs$sum <- numeric(nrow(df_abs))
# seq_len(n)[-1] iterates over rows 2..n and is empty when n <= 1; the
# original `1 : nrow(df_abs)-1` parsed as (1:n) - 1 and relied on
# zero-index subsetting to produce the first row.
for (k in seq_len(nrow(df_abs))[-1]) {
  # TRUE for every observation of varC that falls into interval k
  in_interval <- df_abs$interval[k - 1] < df_data$varC &
    df_data$varC <= df_abs$interval[k]
  # count observations in the interval; stored as a plain number so that
  # the count column stays numeric instead of becoming a list column
  df_abs$count[k] <- sum(in_interval)
  # sum of the observations in the interval
  df_abs$sum[k] <- sum(df_data$varC[in_interval])
}
# relative data frame
# Sort the observations by varC and give the k-th smallest one the
# percentage k * 100 / n. row_number() and n() replace 1:nrow(df_data) and
# length(df_data$varC), so an empty df_data no longer breaks the mutate().
df_data_arranged <- df_data %>%
  arrange(varC) %>%
  mutate(observationPercent = row_number() * 100 / n())

# Row k of df_rel describes the half-open interval
# (interval[k - 1], interval[k]] in percent. Row 1 has no lower endpoint, so
# its count and sum stay at 0.
df_rel$count <- numeric(nrow(df_rel))
df_rel$sum <- numeric(nrow(df_rel))
# seq_len(n)[-1] iterates over rows 2..n and is empty when n <= 1; the
# original `1 : nrow(df_rel)-1` parsed as (1:n) - 1 and relied on
# zero-index subsetting to produce the first row.
for (k in seq_len(nrow(df_rel))[-1]) {
  # TRUE for every sorted observation whose percentage falls into interval k
  in_interval <- df_rel$interval[k - 1] < df_data_arranged$observationPercent &
    df_data_arranged$observationPercent <= df_rel$interval[k]
  # count observations in the interval; stored as a plain number so that
  # the count column stays numeric instead of becoming a list column
  df_rel$count[k] <- sum(in_interval)
  # sum of the observations in the interval
  df_rel$sum[k] <- sum(df_data_arranged$varC[in_interval])
}
# ---------- calculation }----------
结果应如下所示:
# Expected result for the absolute intervals: one row per interval endpoint,
# with the number and the sum of the observations in the interval ending there.
df_abs <- data.frame(
  interval = c(-Inf, -60, -30, 0, 30, 60, 100, 200, Inf),
  count = c(0, 1, 0, 4, 3, 3, 0, 0, 1),
  sum = c(0, -81, 0, -80, 65.2, 146.8, 0, 0, 256.3)
)
# Expected result for the relative intervals. With 12 observations sorted by
# varC, the k-th smallest one sits at k * 100 / 12 percent, so the interval
# (15, 50] holds observations 2..6 (-25.1, -23.1, -17.5, -14.3, 15.3):
# 5 values summing to -64.7. The counts add up to 12 and the sums to 307.3,
# the total of varC.
df_rel <- data.frame(
  interval = c(0, 5, 15, 50, 75, 95, 100),
  count = c(0, 0, 1, 5, 3, 2, 1),
  sum = c(0, 0, -81, -64.7, 92.6, 104.1, 256.3)
)
据我了解dplyr软件包,对于这两个问题中的每一个都应该有一个相当简短的直接解决方案,而不必使用循环。
答案 0 :(得分:2)
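这可以按如下方式完成:
1. 创建一个新列(mutate),通过 base::cut 确定每个观测值属于哪个区间;
2. 按区间分组(group_by);
3. 对每个分组应用所需的汇总操作(summarise,配合 dplyr 的 n() 和 sum)。
如下:
df_data %>%
  mutate(interval = cut(varC, breaks = df_abs$interval)) %>%
  group_by(interval) %>%
  summarise(count = n(), sum = sum(varC))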