查找上次观察到的结果的平均值(在不同日期)

时间:2018-01-29 19:13:09

标签: r dataframe

# Example data: one row per observation, giving the day it was taken,
# the subject's name and the trial result.
dx <- data.frame(
  Day    = c(3, 3, 4, 4, 5, 6, 7, 7, 7),
  Name   = rep(c("Amy", "Bob", "Jim"), times = 3),
  Result = 11:19
)

   Day Name Result
   3  Amy     11
   3  Bob     12
   4  Jim     13
   4  Amy     14
   5  Bob     15
   6  Jim     16
   7  Amy     17
   7  Bob     18
   7  Jim     19

上表显示了3名受试者在特定日期的试验结果。如何创建一个汇总表,显示每个日期之前每位受试者最新观察结果的平均值(用作与即将得到的结果的比较基准)?例如,第6天的“平均最新观察结果”是Amy第4天、Bob第5天和Jim第4天结果的平均值;第7天则是Amy第4天、Bob第5天和Jim第6天结果的平均值。

这是我的解决方案,但如果有数千个日期和受试者,for循环的效率似乎很低。

# For every distinct day, average each person's most recent Result recorded
# strictly before that day. A day with no earlier observation gets NaN,
# because mean() of an empty vector is NaN.
# Uses the data frame `dx` (columns Day, Name, Result); produces `output`
# with columns Day and AvgLatestResult.
output <- data.frame(Day = unique(dx$Day))
output$AvgLatestResult <- vapply(
  output$Day,
  function(day) {
    # All results observed before `day`.
    prior <- dx[dx$Day < day, ]
    # Sort by person, newest day first, so each person's first row is
    # their latest result.
    prior <- prior[order(prior$Name, -prior$Day), ]
    latest <- prior[!duplicated(prior$Name), ]
    mean(latest$Result)
  },
  numeric(1)
)

 Day AvgLatestResult 
 3   NaN  
 4   11.5
 5   13.0
 6   14.0
 7   15.0

3 个答案:

答案 0 :(得分:1)

您可以将数据从长格式转换为宽格式(dcast),用前一个非缺失值填充NA(na.locf),然后对除最后一行以外的所有行计算行均值(rowMeans(head(..., -1), ...)),并在结果开头补一个NA。

library(zoo)
library(data.table)

# Reshape to one column per Name, carry each person's last Result forward,
# average every row except the last, and pad the front with NA so that each
# position refers to the rows before it.
# NOTE(review): dx is a plain data.frame at this point -- confirm that the
# installed data.table's dcast() accepts a data.frame.
local({
  wide <- dcast(dx, Day ~ Name, value.var = "Result")[, -1]  # drop 1st column
  carried <- na.locf(wide)
  c(NA, rowMeans(head(carried, -1), na.rm = TRUE))
})
# [1]   NA 11.5 13.0 14.0 15.0

或者将'dx'展开为每个'Day'与'Name'的组合各占一行(CJ),按'Name'分组填充缺失值(na.locf),创建一个“日期索引”(day index),然后计算前一个日期的均值(mean)。

# Alternative: expand dx to one row per Day/Name combination, carry each
# person's last Result forward, then average the previous day's rows.

# Turn dx into a data.table in place (setDT is from data.table).
setDT(dx)
# Join dx onto the full Day x Name grid built by CJ(); combinations that do
# not occur in dx presumably come back with NA Result -- TODO confirm.
d2 <- dx[CJ(Day = unique(dx$Day), Name = unique(dx$Name)), on = .(Day, Name)]
# Within each Name, fill NA Results with na.locf(); na.rm = FALSE is passed,
# presumably so leading NAs are kept and the column length is unchanged.
d2[ , Result2 := na.locf(Result, na.rm = FALSE), by = Name]
# ix is the group counter (.GRP) of each Day, used as a day index.
d2[ , ix := .GRP, by = Day]
# For each Day, average Result2 over the rows of d2 whose day index is one
# less than the current group's index. The first Day matches no rows, which
# gives the NaN shown below.
d2[ , .(avg = mean(d2[ix == .GRP - 1, Result2], na.rm = TRUE)), by = Day]
#    Day  avg
# 1:   3  NaN
# 2:   4 11.5
# 3:   5 13.0
# 4:   6 14.0
# 5:   7 15.0

答案 1 :(得分:1)

将dx读入为zoo对象z,并按Name拆分,得到一个5 x 3的宽格式对象,其中行对应Day、列对应Name。然后使用na.locf填充NA值,再用rollapply配合偏移量list(-1)(即使用前一行的值)和mean计算均值。这会得到一个zoo对象AvgLatest,可以选择用fortify.zoo将其转换为数据框。(如果希望结果保持为zoo对象,请省略fortify.zoo这一行。)

library(zoo)

# Read dx as a zoo series split by Name (one column per person).
z <- read.zoo(dx, split = "Name")

# Carry each person's last observation forward; na.rm = FALSE is passed
# through to na.locf().
z.fill <- na.locf(z, na.rm = FALSE)

# At every row, take the mean across all columns of the row at offset -1
# (the previous row); rows without a previous row are filled with NA.
AvgLatest <- rollapply(
  z.fill,
  width = list(-1),
  FUN = mean,
  na.rm = TRUE,
  by.column = FALSE,
  fill = NA
)

fortify.zoo(AvgLatest, names = "Day")  # optional

输出如下:

  Day AvgLatest
1   3        NA
2   4      11.5
3   5      13.0
4   6      14.0
5   7      15.0

变体

此代码可以使用像这样的magrittr管道来表达:

library(zoo)
library(magrittr)

# Same computation as the step-by-step zoo version, written as one pipeline.
dx %>%
  # zoo series split by Name (one column per person)
  read.zoo(split = "Name") %>%
  # carry the last observation forward
  na.locf(na.rm = FALSE) %>%
  # mean across columns of the previous row (offset -1), NA where none exists
  rollapply(
    width = list(-1),
    FUN = mean,
    na.rm = TRUE,
    by.column = FALSE,
    fill = NA
  ) %>%
  fortify.zoo(names = "Day")  # optional

答案 2 :(得分:0)

对于较大的数据帧,这可能会更快一些:

# https://stackoverflow.com/questions/7735647/replacing-nas-with-latest-non-na-value
# Replace every NA in `x` with the most recent non-NA value before it.
# Leading NAs (with no earlier value) are left as NA.
#
# x: a vector that may contain NA.
# Returns a vector of the same length as `x`.
repeat.before <- function(x) {
  # Positions that start a run: every non-NA value ...
  run_starts <- which(!is.na(x))
  # ... plus position 1 when the vector opens with NA, so that the leading
  # NA run is reproduced as NA.
  if (is.na(x[1])) {
    run_starts <- c(1, run_starts)
  }
  # Each run extends up to (not including) the next run start, or to the end.
  run_lengths <- diff(c(run_starts, length(x) + 1))
  rep(x[run_starts], times = run_lengths)
}

# One row per integer day between the first and last observed day, so that
# every person's results can be aligned on a common day grid.
day_seq <- data.frame(Day = seq(min(dx$Day), max(dx$Day)))

# Per person: align Results to the day grid (days without an observation
# become NA after the full outer merge) and carry the last Result forward.
# Returns the filled Result vector explicitly.
latest_by_name <- by(dx, dx$Name, function(person) {
  aligned <- merge(person, day_seq, by = "Day", all = TRUE)
  repeat.before(aligned$Result)
})

# Bind the per-person vectors as columns and average across people for each
# day. Prepending NA shifts the means by one position, so position k holds
# the mean of the latest results known before day k of the grid.
out <- c(NA, rowMeans(do.call(cbind, latest_by_name), na.rm = TRUE))

# The shift leaves one surplus element at the end; drop it for display.
out[-length(out)]

NA 11.5 13.0 14.0 15.0