预测包外的分段lm

时间:2018-06-08 20:36:28

标签: r lm predict

我有一系列来自数百个分段线性模型的输出(使用R中的分段包制作)。我希望能够使用预测功能在新数据上使用这些输出。要清楚,我的工作区中没有分段线性模型对象;我刚刚保存并重新导入了相关输出(例如系数和断点)。出于这个原因,我不能简单地使用分段包中的predict.segmented函数。

下面是一个基于this link的玩具示例,看似很有希望,但与predict.segmented函数的输出不匹配。

library(segmented) 
set.seed(12) 

xx <- 1:100 
zz <- runif(100) 
yy <- 2 + 1.5*pmax(xx-35,0) - 1.5*pmax(xx-70,0) + 
               15*pmax(zz-0.5,0) + rnorm(100,0,2) 
dati <- data.frame(x=xx,y=yy,z=zz) 

out.lm<-lm(y~x,data=dati) 
o<-## S3 method for class 'lm': 
     segmented(out.lm,seg.Z=~x,psi=list(x=c(30,60)), 
           control=seg.control(display=FALSE)) 

# Note that coefficients with U in the name are differences in slopes, not slopes. 
# Compare:
slope(o)
coef(o)[2] + coef(o)[3]
coef(o)[2] + coef(o)[3] + coef(o)[4]

# prediction 
pred <- data.frame(x = 1:100) 
pred$dummy1 <- pmax(pred$x - o$psi[1,2], 0) 
pred$dummy2 <- pmax(pred$x - o$psi[2,2], 0) 
pred$dummy3 <- I(pred$x > o$psi[1,2]) * (coef(o)[2] + coef(o)[3])
pred$dummy4 <- I(pred$x > o$psi[2,2]) * (coef(o)[2] + coef(o)[3] + coef(o)[4]) 
names(pred)[-1]<- names(model.frame(o))[-c(1,2)] 

# compute the prediction, using standard predict function 
# computing confidence intervals further 
# suppose that the breakpoints are fixed 
pred <- data.frame(pred, predict(o, newdata= pred, 
                       interval="confidence")) 

# Try prediction using the predict.segment version to compare
test <- predict.segmented(o)
plot(pred$fit, test, ylim = c(0, 100))
abline(0,1, col = "red")
# At least one segment not being predicted correctly?

我可以使用base r predict()函数(而不是segmented.predict()函数)和从分段线性模型中保存的系数和断点吗?

更新 我发现上面的代码有问题(不要使用它)。通过segmented.predict()函数的一些逆向工程,我生成了设计矩阵并使用它来预测值而不是直接使用predict()函数。我不认为这是对原始问题的完整答案,因为predict()也可以为预测产生置信区间,而我还没有实现 - 对某人来说仍然存在增加置信区间的问题。

library(segmented)


## Define function for making matrix of dummy variables (this is based on code from predict.segmented())
dummy.matrix <- function(x.values, x_names, psi.est = TRUE, nameU, nameV, diffSlope, est.psi) {
  # This function creates a model matrix with dummy variables for a segmented lm with two breakpoints.
  # Inputs:
  # x.values: the x values of the segmented lm
  # x_names: the name of the column of x values
  # psi.est: this is legacy from the predict.segmented function, leave it set to 'TRUE'
  # obj: the segmented lm object
  # nameU: names (class character) of 3rd and 4th coef, which are "U1.x" "U2.x" for lm with two breaks. Example: names(c(obj$coef[3], obj$coef[4]))
  # nameV: names (class character) of 5th and 6th coef, which are "psi1.x" "psi2.x" for lm with two breaks. Example: names(c(obj$coef[5], obj$coef[6]))
  # diffSlope: the coefficients (class numeric) with the slope differences; called U1.x and U2.x for lm with two breaks. Example: c(o$coef[3], o$coef[4])
  # est.psi: the estimated break points (class numeric); these are the estimated breakpoints from segmented.lm. Example: c(obj$psi[1,2], obj$psi[2,2])
  #
  n <- length(x.values)
  k <- length(est.psi)
  PSI <- matrix(rep(est.psi, rep(n, k)), ncol = k)
  newZ <- matrix(x.values, nrow = n, ncol = k, byrow = FALSE)
  dummy1 <- pmax(newZ - PSI, 0)
  if (psi.est) {
    V <- ifelse(newZ > PSI, -1, 0)
    dummy2 <- if (k == 1) 
      V * diffSlope
    else V %*% diag(diffSlope)
    newd <- cbind(x.values, dummy1, dummy2)
    colnames(newd) <- c(x_names, nameU, nameV)
  } else {
    newd <- cbind(x.values, dummy1)
    colnames(newd) <- c(x_names, nameU)
  }
  # if (!x_names %in% names(coef(obj.seg))) 
  #   newd <- newd[, -1, drop = FALSE]
  return(newd)
}

## Test dummy matrix function----------------------------------------------
set.seed(12)
xx<-1:100
zz<-runif(100)
yy<-2+1.5*pmax(xx-35,0)-1.5*pmax(xx-70,0)+15*pmax(zz-.5,0)+rnorm(100,0,2)
dati<-data.frame(x=xx,y=yy,z=zz)
out.lm<-lm(y~x,data=dati)

#1 segmented variable, 2 breakpoints: you have to specify starting values (vector) for psi:
o<-segmented(out.lm,seg.Z=~x,psi=c(30,60),
             control=seg.control(display=FALSE))
slope(o)
plot.segmented(o)
summary(o)


# Test dummy matrix fn with the same dataset
newdata <- dati
nameU1 <- c("U1.x", "U2.x")
nameV1 <- c("psi1.x", "psi2.x")
diffSlope1 <- c(o$coef[3], o$coef[4])
est.psi1 <- c(o$psi[1,2], o$psi[2,2])

test <- dummy.matrix(x.values = newdata$x, x_names = "x", psi.est = TRUE, 
                     nameU = nameU1, nameV = nameV1, diffSlope = diffSlope1, est.psi = est.psi1)


# Predict response variable using matrix multiplication
col1 <- matrix(1, nrow = dim(test)[1])
test <- cbind(col1, test) # Now test is the same as model.matrix(o)
predY <- coef(o) %*% t(test)
plot(predY[1,])
lines(predict.segmented(o), col = "blue") # good, predict.segmented gives same answer

0 个答案:

没有答案