对多个自变量和因变量拟合回归模型,并按分组变量分别得到各组的拟合结果

时间:2015-09-02 22:16:22

标签: r nls

让我们再试一次......

我想使用mtcars数据集,用同一种模型形式对多个因变量和自变量拟合非线性回归模型。假设我想用变量disp、hp和wt分别解释mpg和drat。拟合模型之后,我想计算总平方和(TSS)与残差平方和(RSS),并将它们存储在一个矩阵中。这可以用下面这种冗长的方式完成。

# Work on a plain data.frame copy of the built-in mtcars data.
dt <- data.frame(mtcars)

# Every fit uses the model y = B0 * x^B1 * exp(B2 * x) and the same
# starting values for the three parameters.
start_vals <- c(B0 = 45, B1 = 0.2, B2 = 0.0007)

# Response mpg against each of the three predictors.
m1 <- nls(mpg ~ B0 * (disp^B1) * exp(B2 * disp), data = dt, start = start_vals)
m2 <- nls(mpg ~ B0 * (hp^B1) * exp(B2 * hp), data = dt, start = start_vals)
m3 <- nls(mpg ~ B0 * (wt^B1) * exp(B2 * wt), data = dt, start = start_vals)

# Response drat against each of the three predictors.
m4 <- nls(drat ~ B0 * (disp^B1) * exp(B2 * disp), data = dt, start = start_vals)
m5 <- nls(drat ~ B0 * (hp^B1) * exp(B2 * hp), data = dt, start = start_vals)
m6 <- nls(drat ~ B0 * (wt^B1) * exp(B2 * wt), data = dt, start = start_vals)

# Total sum of squares of each response around its own mean.
TSS.mpg <- with(dt, sum((mpg - mean(mpg))^2))
TSS.drat <- with(dt, sum((drat - mean(drat))^2))

# Residual sum of squares of each fitted model.
RSS.m1 <- sum(resid(m1)^2)
RSS.m2 <- sum(resid(m2)^2)
RSS.m3 <- sum(resid(m3)^2)
RSS.m4 <- sum(resid(m4)^2)
RSS.m5 <- sum(resid(m5)^2)
RSS.m6 <- sum(resid(m6)^2)

# 6 x 2 result, one row per model (m1..m6): column 1 holds the TSS of the
# model's response, column 2 the model's RSS. matrix() fills column-wise.
sumsqu <- matrix(
  c(
    rep(c(TSS.mpg, TSS.drat), each = 3),
    RSS.m1, RSS.m2, RSS.m3, RSS.m4, RSS.m5, RSS.m6
  ),
  nrow = 6,
  ncol = 2
)

因此,最终结果是一个矩阵,第一列为总平方和,第二列为残差平方和。现在,让我们加入一个分组因子,使问题更复杂一些。我想做同样的模型拟合和平方和提取,但要按变量“am”的两个组(am = 0或1)分别进行。最终结果是一个与第1部分类似的矩阵,但有四列:前两列对应am = 0,后两列对应am = 1。同样,这也可以用冗长的方式完成。

# Every fit uses the model y = B0 * x^B1 * exp(B2 * x) and the same
# starting values for the three parameters.
start_vals <- c(B0 = 45, B1 = 0.2, B2 = 0.0007)

# ---- Group am == 0: subset the data and refit all six models ----
dt0 <- dt[dt$am == 0, ]

m1.0 <- nls(mpg ~ B0 * (disp^B1) * exp(B2 * disp), data = dt0, start = start_vals)
m2.0 <- nls(mpg ~ B0 * (hp^B1) * exp(B2 * hp), data = dt0, start = start_vals)
m3.0 <- nls(mpg ~ B0 * (wt^B1) * exp(B2 * wt), data = dt0, start = start_vals)
m4.0 <- nls(drat ~ B0 * (disp^B1) * exp(B2 * disp), data = dt0, start = start_vals)
m5.0 <- nls(drat ~ B0 * (hp^B1) * exp(B2 * hp), data = dt0, start = start_vals)
m6.0 <- nls(drat ~ B0 * (wt^B1) * exp(B2 * wt), data = dt0, start = start_vals)

# Total sum of squares of each response within the group.
TSS.mpg.0 <- with(dt0, sum((mpg - mean(mpg))^2))
TSS.drat.0 <- with(dt0, sum((drat - mean(drat))^2))

# Residual sum of squares of each model within the group.
RSS.m1.0 <- sum(resid(m1.0)^2)
RSS.m2.0 <- sum(resid(m2.0)^2)
RSS.m3.0 <- sum(resid(m3.0)^2)
RSS.m4.0 <- sum(resid(m4.0)^2)
RSS.m5.0 <- sum(resid(m5.0)^2)
RSS.m6.0 <- sum(resid(m6.0)^2)

# 6 x 2 matrix for am == 0: column 1 = TSS, column 2 = RSS (filled column-wise).
sumsqu.0 <- matrix(
  c(
    rep(c(TSS.mpg.0, TSS.drat.0), each = 3),
    RSS.m1.0, RSS.m2.0, RSS.m3.0, RSS.m4.0, RSS.m5.0, RSS.m6.0
  ),
  nrow = 6,
  ncol = 2
)

# ---- Group am == 1: subset the data and refit all six models ----
dt1 <- dt[dt$am == 1, ]

m1.1 <- nls(mpg ~ B0 * (disp^B1) * exp(B2 * disp), data = dt1, start = start_vals)
m2.1 <- nls(mpg ~ B0 * (hp^B1) * exp(B2 * hp), data = dt1, start = start_vals)
m3.1 <- nls(mpg ~ B0 * (wt^B1) * exp(B2 * wt), data = dt1, start = start_vals)
m4.1 <- nls(drat ~ B0 * (disp^B1) * exp(B2 * disp), data = dt1, start = start_vals)
m5.1 <- nls(drat ~ B0 * (hp^B1) * exp(B2 * hp), data = dt1, start = start_vals)
m6.1 <- nls(drat ~ B0 * (wt^B1) * exp(B2 * wt), data = dt1, start = start_vals)

# Total sum of squares of each response within the group.
TSS.mpg.1 <- with(dt1, sum((mpg - mean(mpg))^2))
TSS.drat.1 <- with(dt1, sum((drat - mean(drat))^2))

# Residual sum of squares of each model within the group.
RSS.m1.1 <- sum(resid(m1.1)^2)
RSS.m2.1 <- sum(resid(m2.1)^2)
RSS.m3.1 <- sum(resid(m3.1)^2)
RSS.m4.1 <- sum(resid(m4.1)^2)
RSS.m5.1 <- sum(resid(m5.1)^2)
RSS.m6.1 <- sum(resid(m6.1)^2)

# 6 x 2 matrix for am == 1: column 1 = TSS, column 2 = RSS (filled column-wise).
sumsqu.1 <- matrix(
  c(
    rep(c(TSS.mpg.1, TSS.drat.1), each = 3),
    RSS.m1.1, RSS.m2.1, RSS.m3.1, RSS.m4.1, RSS.m5.1, RSS.m6.1
  ),
  nrow = 6,
  ncol = 2
)

# Side-by-side result: columns 1-2 are am == 0, columns 3-4 are am == 1.
allSS <- cbind(sumsqu.0, sumsqu.1)
allSS

正如您所看到的,我目前会用的方法相当冗长。现在想象一下,我的真实问题有6个因变量、7个自变量、5个分组,并且要从每个拟合中提取10个左右的数值。从我的代码可以看出我不是程序员,因为我的方法非常低效。我想也许可以编写某种函数,然后配合apply系列函数使用,例如:

#' Fit y = B0 * x^B1 * exp(B2 * x) by nonlinear least squares and report
#' the residual and total sums of squares.
#'
#' The original version evaluated `RSS` and then `TSS` as bare expressions,
#' so only `TSS` (the last expression) was returned and `RSS` was discarded.
#' It also passed the global `dt0` as `data` even though the values come in
#' through `x` and `y`; the fit now uses the arguments directly.
#'
#' @param x Numeric vector of predictor values.
#' @param y Numeric vector of response values, same length as `x`.
#' @param start Named numeric vector of starting values for `B0`, `B1`
#'   and `B2`.
#' @return Named numeric vector with elements `RSS` (residual sum of squares
#'   of the fit) and `TSS` (sum of squares of `y` around its mean).
nls1 <- function(x, y, start = c(B0 = 3.5, B1 = 0.2, B2 = 0.0007)) {
  stopifnot(is.numeric(x), is.numeric(y), length(x) == length(y))

  # No `data` argument: nls() finds x and y in this function's environment.
  fit <- nls(y ~ B0 * (x^B1) * exp(B2 * x), start = start)

  c(
    RSS = sum(residuals(fit)^2),
    TSS = sum((y - mean(y))^2)
  )
}

非常感谢任何有助于提高此流程效率的帮助。

2 个答案:

答案 0 :(得分:4)

这里我使用2个因变量(drat、mpg)、3个自变量(disp、hp、wt)和1个具有2个水平的分组变量(am取1或0)。

(此处原本应为代码示例,但在当前文本中缺失。)

正如您所看到的,上述方法会重塑数据,并创建一个包含所有y和x变量组合的更大的数据集。如果您的数据集本身就很庞大,这可能会带来问题;其他遇到类似问题的人如果需要处理很长的变量,创建这样的大数据集同样可能出问题。

更好的做法是为每个模型拟合构造所需的公式,而不是创建变量组合。这种方法与@BondedDust在下面提出的方法类似。

library(dplyr)
library(tidyr)

# example dataset (keep only the columns used below)
dt <- data.frame(mtcars) %>% select(drat, mpg, disp, hp, wt, am)

# columns used as y (dependent) and x (independent);
# `am` is the grouping variable and is kept as an ordinary column
ynames <- c("drat", "mpg")
xnames <- c("disp", "hp", "wt")

# Fit yvalue = B0 * xvalue^B1 * exp(B2 * xvalue) to the rows of one
# (y, x, am) group and return its residual and total sums of squares as a
# one-row data frame. `key` (the group's key row supplied by group_modify())
# is not used.
fit_ss <- function(d, key) {
  m1 <- nls(
    yvalue ~ B0 * (xvalue^B1) * exp(B2 * xvalue),
    data = d,
    start = c(B0 = 45, B1 = 0.2, B2 = 0.0007)
  )
  data.frame(
    RSS = sum(residuals(m1)^2),
    TSS = sum((d$yvalue - mean(d$yvalue))^2)
  )
}

# Reshape to long form twice so each original row contributes every (y, x)
# pairing of its own values; this replaces gather() plus the implicit row
# recycling of data.frame(., dt2). Then fit one model per (y, x, am) group.
dt %>%
  pivot_longer(all_of(ynames), names_to = "y", values_to = "yvalue") %>%
  pivot_longer(all_of(xnames), names_to = "x", values_to = "xvalue") %>%
  group_by(y, x, am) %>%
  group_modify(fit_ss) %>%
  ungroup()

#       y    x am         RSS        TSS
# 1  drat disp  0   1.3090406   2.770242
# 2  drat disp  1   1.1155372   1.590400
# 3  drat   hp  0   2.1707337   2.770242
# 4  drat   hp  1   0.8342527   1.590400
# 5  drat   wt  0   2.2100162   2.770242
# 6  drat   wt  1   1.1885811   1.590400
# 7   mpg disp  0  98.4815286 264.587368
# 8   mpg disp  1  46.8674036 456.309231
# 9   mpg   hp  0  74.9295161 264.587368
# 10  mpg   hp  1 112.5548955 456.309231
# 11  mpg   wt  0 104.2894519 264.587368
# 12  mpg   wt  1  71.1402536 456.309231

答案 1 :(得分:1)

您可以尝试以下内容:

# All response/predictor name combinations. stringsAsFactors = FALSE keeps
# the names as character vectors, so no factor-to-matrix coercion through
# apply() is needed.
vars <- expand.grid(
  Y = c("a1", "a2", "a3"),
  X = c("b1", "b2", "b3", "b4"),
  stringsAsFactors = FALSE
)

# One formula `Y ~ X` per row of `vars`, labelled by its own text so the
# results can be looked up by name, e.g. models_list[["a1~b2"]].
formulas <- Map(
  function(y, x) reformulate(x, response = y),
  vars$Y,
  vars$X
)
names(formulas) <- paste(vars$Y, vars$X, sep = "~")

# Fit each model and keep its summary. `your_df` is a placeholder for the
# data frame that contains the columns named in `vars`.
# `formula =` is spelled out in full; the original `form =` only worked
# through partial argument matching.
models_list <- lapply(
  formulas,
  function(form) summary(lm(formula = form, data = your_df))
)

用aov代替lm可能会得到更符合您需要的结果。请参阅?summary.aov并运行其中的示例,了解您的最终目的可能需要哪些组件。