Question

我有一个类似于以下的数据框，总共有500列：

Advanced >

我想计算滚动窗口线性回归，其中我的窗口大小为12个数据点，每个连续回归由6个数据点分隔。对于每个回归，“Days”将始终是模型的x分量，y将是其他每个列（B1，后跟B2，B3等）。然后，我想将系数保存为具有现有列标题（B1，B2等）的数据帧。

我认为我的代码很接近，但效果不好。我在动物园图书馆里使用了rollapply。

Probes <- data.frame(Days=seq(0.01, 4.91, 0.01), B1=5:495,B2=-100:390, B3=10:500,B4=-200:290)

如果可能的话，我还想将“xmins”保存到矢量中以添加到数据帧中。这意味着每个回归中使用的最小x值（基本上它是“Days”列中的每6个数字。）谢谢你的帮助。

Answer 1

1）定义一个动物园对象z，其数据包含Probes，其索引取自第一列探针，即Days。注意lm允许y为矩阵定义coefs函数，该函数计算回归系数。最后rollapply超过z。请注意，返回的对象的索引给出了xmin。

library(zoo)

z <- zoo(Probes, Probes[[1]])

coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1])))))
rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")

，并提供：

> head(rz)
     B11 B12  B21 B22 B31 B32  B41 B42
0.01   4 100 -101 100   9 100 -201 100
0.07   4 100 -101 100   9 100 -201 100
0.13   4 100 -101 100   9 100 -201 100
0.19   4 100 -101 100   9 100 -201 100
0.25   4 100 -101 100   9 100 -201 100
0.31   4 100 -101 100   9 100 -201 100

请注意，如果您需要DF <- fortify.zoo(rz)的数据框表示，则可以使用rz。

2）对行号进行rollaplly的替代方法有点类似：

library(zoo)
y <- as.matrix(Probes[-1])
Days <- Probes$Days
n <- nrow(Probes)
coefs <- function(ix) c(unlist(as.data.frame(coef(lm(y ~ Days, subset = ix)))), 
      xmins = Days[ix][1])
r <- rollapply(1:n, 12, by = 6, coefs)

Answer 2

试试这个：

# here are the xmin values you wanted
xmins <- Probes$Days[seq(1,nrow(Probes),6)]

# here we build a function that will run regressions across the columns
# y1 vs x, y2 vs x, y3 vs x...
# you enter the window and by (12/6) in order to limit the interval being
# regressed. this is later called in do.call
runreg <- function(Probes,m,window=12,by=6){

  # beg,end are used to specify the interval
  beg <- seq(1,nrow(Probes),by)[m]
  end <- beg+window-1

  # this is used to go through all the columns
  N <- ncol(Probes)-1
  tmp <- numeric(N)
  # go through each column and store the coefficients in tmp
  for(i in 1:N){
     y <- Probes[[i+1]][beg:end]
     x <- Probes$Days[beg:end]
     tmp[i] <- coef(lm(y~x))[2][[1]]
  }
  # put all our column regressions into a dataframe
  res <- rbind('coeff'=tmp)
  colnames(res) <- colnames(Probes)[-1]

  return(res)
}

# now that we've built the function to do the column regressions
# we just need to go through all the window-ed regressions (row regressions)
res <- do.call(rbind,lapply(1:length(xmins),function(m) runreg(Probes,m)))

# these rownames are the index of the xmin values
rownames(res) <- seq(1,nrow(Probes),6)
res <- data.frame(res,xmins)

Answer 3

您还可以如下使用rollRegres包

# setup data
Probes <- data.frame(
  # I changed the days to be intergers
  Days=seq(1L, 491L, 1L),
  B1=5:495, B2=-100:390, B3=10:500 , B4=-200:290)

# setup grp argument
grp_arg <- as.integer((Probes$Days - 1L) %/% 6)

# estimate coefs. width argument is realtive in grp units
library(rollRegres)
X <- cbind(1, Probes$Days / 100)
Ys <- as.matrix(Probes[, 2:5])
out <- lapply(1:ncol(Ys), function(i)
  roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
out <- do.call(cbind, out)

# only keep the complete.cases and the unique values
colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
out <- out[complete.cases(out), ]
head(out)
#R      B10 B11  B20 B21 B30 B31  B40 B41
#R [1,]   4 100 -101 100   9 100 -201 100
#R [2,]   4 100 -101 100   9 100 -201 100
#R [3,]   4 100 -101 100   9 100 -201 100
#R [4,]   4 100 -101 100   9 100 -201 100
#R [5,]   4 100 -101 100   9 100 -201 100
#R [6,]   4 100 -101 100   9 100 -201 100

该解决方案比zoo解决方案

library(zoo) coefs <- function(z) c(unlist(as.data.frame(coef(lm(z[,-1] ~ z[,1]))))) microbenchmark::microbenchmark(   rollapply = {
    z <- zoo(Probes, Probes[[1]])
    rz <- rollapply(z, 12, by = 6, coefs, by.column = FALSE, align = "left")   },   roll_regres = {
    grp_arg <- as.integer((Probes$Days - 1L) %/% 6)

    X <- cbind(1, Probes$Days / 100)
    Ys <- as.matrix(Probes[, 2:5])
    out <- lapply(1:ncol(Ys), function(i)
      roll_regres.fit(x = X, y = Ys[, i], width = 2L, grp = grp_arg)$coefs)
    out <- do.call(cbind, out)

    colnames(out) <- sapply(1:4, function(i) paste0("B", i, 0:1))
    out <- out[c(T, grp_arg[-1] != head(grp_arg, -1)), ]
    out <- out[complete.cases(out), ]
    head(out)   } )
#R Unit: microseconds
#R        expr       min        lq      mean     median        uq       max neval
#R   rollapply 53392.614 56330.492 59793.106 58363.2825 60902.938 119206.76   100
#R roll_regres   865.186   920.297  1074.161   983.9015  1047.705   5071.41   100

由于版本0.1.0中的验证错误，目前您需要从Github安装软件包。因此，运行

devtools::install_github("boennecd/rollRegres", upgrade_dependencies = FALSE,
                         build_vignettes = TRUE)

在多列数据上使用rollapply和lm

3 个答案: