# Example trial results: one row per observation of a subject on a given day.
# The three subjects always appear in the order Amy, Bob, Jim.
dx <- data.frame(
  Day = c(3, 3, 4, 4, 5, 6, 7, 7, 7),
  Name = rep(c("Amy", "Bob", "Jim"), times = 3),
  Result = 11:19
)
Day Name Result
3 Amy 11
3 Bob 12
4 Jim 13
4 Amy 14
5 Bob 15
6 Jim 16
7 Amy 17
7 Bob 18
7 Jim 19
上表显示了3名受试者在特定日期的试验结果。如何创建一个汇总表,显示每一天之前各受试者最新一次观察结果的平均值(用于与即将产生的结果进行比较)?例如,对于第6天,参与平均的最新观察结果是Amy第4天、Bob第5天和Jim第4天的结果;对于第7天,则是Amy第4天、Bob第5天和Jim第6天的结果。
这是我的解决方案,但如果有数千个日期和受试者,for循环似乎效率低下。
# For every distinct day, average each subject's most recent result recorded
# strictly before that day. A day with no earlier observations gets NaN,
# because mean() of an empty vector is NaN.
output <- data.frame(Day = unique(dx$Day)) # Extract unique days
# seq_len() is safe when `output` has zero rows (1:0 would iterate twice).
for (i in seq_len(nrow(output))) {
  # Find all results prior to the current day. The day is read from `output`;
  # the original referenced an undefined object `dx2` here.
  dfTemp <- dx[dx$Day < output[i, "Day"], ]
  # Sort by name, then descending by day, so each person's latest row is first.
  dfTemp <- dfTemp[with(dfTemp, order(Name, -Day)), ]
  # match() returns the first position of each name, which after the sort
  # above is that person's latest result.
  dfTemp <- dfTemp[match(unique(dfTemp$Name), dfTemp$Name), ]
  output[i, "AvgLatestResult"] <- mean(dfTemp[, "Result"]) # Find mean
}
Day AvgLatestResult
3 NaN
4 11.5
5 13.0
6 14.0
7 15.0
答案 0 :(得分:1)
您可以将数据从长格式转换为宽格式(dcast
),用前一个非缺失值填充NA(na.locf
),
然后对除最后一行以外的所有行计算行均值(rowMeans(head(..., -1)...
),
并在结果开头补一个NA。
library(zoo)
library(data.table)
# Reshape to one column per Name (dropping the Day column), carry the last
# observation forward, then average each row except the last. The leading NA
# shifts the means so every day only sees results from earlier days.
wide <- dcast(dx, Day ~ Name, value.var = "Result")[ , -1]
filled <- na.locf(wide)
c(NA, rowMeans(head(filled, -1), na.rm = TRUE))
# [1] NA 11.5 13.0 14.0 15.0
或者将'dx'展开为每个'Day'与'Name'的组合各占一行(CJ
),按'Name'分组填充缺失值(na.locf
),创建一个“日期索引”(day index),然后计算前一个日期的均值。
# Convert dx to a data.table so the data.table syntax below applies.
setDT(dx)
# Expand to one row per Day/Name combination (CJ builds the cross join);
# combinations that were never observed get NA in Result.
d2 <- dx[CJ(Day = unique(dx$Day), Name = unique(dx$Name)), on = .(Day, Name)]
# Within each Name, carry the last observed Result forward. na.rm = FALSE keeps
# leading NAs so every Name retains one row per Day.
d2[ , Result2 := na.locf(Result, na.rm = FALSE), by = Name]
# Number the Day groups 1, 2, ... (the "day index").
d2[ , ix := .GRP, by = Day]
# For each Day, average the filled results belonging to the previous day index.
# The first Day has no predecessor, so its mean is taken over no values (NaN).
d2[ , .(avg = mean(d2[ix == .GRP - 1, Result2], na.rm = TRUE)), by = Day]
# Day avg
# 1: 3 NaN
# 2: 4 11.5
# 3: 5 13.0
# 4: 6 14.0
# 5: 7 15.0
答案 1 :(得分:1)
将dx
读入为一个zoo对象z
,并按Name
拆分,得到一个5 x 3的宽格式对象,其中行为Day,列为Name。然后使用na.locf
填充NA值,再使用rollapply
并指定list(-1)
(表示使用前一行的值)以及mean
来计算均值。这样得到一个zoo对象AvgLatest
,我们可以选择使用fortify.zoo
将其转换为数据框。(如果结果保留为zoo对象即可,则省略fortify.zoo
这一行。)
library(zoo)
# Read dx into a zoo object split on Name: one row per Day, one column per Name.
z <- read.zoo(dx, split = "Name")
# Carry each subject's last observation forward; na.rm = FALSE keeps leading NAs.
z.fill <- na.locf(z, na.rm = FALSE)
# list(-1) applies the function at offset -1, i.e. to the previous row only.
# by.column = FALSE averages across all Name columns at once, and fill = NA
# pads the first Day, which has no previous row.
AvgLatest <- rollapply(z.fill, list(-1), mean, na.rm = TRUE, by.column = FALSE, fill = NA)
# NOTE(review): the printed value column is labelled "AvgLatest", matching this
# variable's name; presumably it is derived from the argument expression, so
# confirm before renaming the variable.
fortify.zoo(AvgLatest, names = "Day") # optional
输出结果如下:
Day AvgLatest
1 3 NA
2 4 11.5
3 5 13.0
4 6 14.0
5 7 15.0
此代码可以使用像这样的magrittr管道来表达:
library(zoo)
library(magrittr)
# Single magrittr pipeline: reshape dx to one column per Name, carry
# observations forward, average the previous row across subjects, and
# optionally convert the zoo result to a data frame.
dx %>%
read.zoo(split = "Name") %>%
na.locf(na.rm = FALSE) %>%
# list(-1) uses only the previous row; fill = NA pads the first Day.
rollapply(list(-1), mean, na.rm = TRUE, by.column = FALSE, fill = NA) %>%
fortify.zoo(names = "Day") # optional
答案 2 :(得分:0)
对于较大的数据帧,这可能会更快一些:
# https://stackoverflow.com/questions/7735647/replacing-nas-with-latest-non-na-value
# Replace every NA in `x` with the most recent non-NA value before it.
# Leading NAs have no earlier value and therefore stay NA.
# Returns a vector of the same length as `x`.
repeat.before <- function(x) {
  # Positions holding an observed value; each one starts a run.
  anchors <- which(!is.na(x))
  if (is.na(x[1])) {
    # Treat a leading NA as its own run so the output keeps its length.
    anchors <- c(1, anchors)
  }
  # Each run extends up to (but excluding) the next anchor or the end of x.
  run_lengths <- diff(c(anchors, length(x) + 1))
  rep(x[anchors], times = run_lengths)
}
# Calendar of every day between the first and last observed day.
day_seq <- data.frame(Day = seq(min(dx$Day), max(dx$Day)))

# Build one column per subject holding that subject's results aligned to every
# calendar day, with gaps filled by the latest earlier result. The row means
# are the average latest result up to and including each day; the leading NA
# shifts them so each position describes only the days before it.
out <- c(
  NA,
  rowMeans(
    do.call(cbind, by(dx, dx$Name, function(subject) {
      daily <- merge(subject, day_seq, by = "Day", all = TRUE)
      repeat.before(daily$Result)
    })),
    na.rm = TRUE
  )
)
# The shift leaves one surplus trailing value; drop it.
head(out, -1)
NA 11.5 13.0 14.0 15.0