在R

时间:2017-02-24 18:23:48

标签: r lapply lm

我有一个数据框,需要针对每个分组(站点,即SiteName列)分别运行6个双变量线性模型,然后把结果转换为数据框。各个线性模型之间只有第二个自变量不同。这一部分我已经用lapply()解决了,但我不知道如何按组运行。我在SO上找到了能解决我部分问题的答案,却不知道如何把它们组合在一起。

以下是一些数据:

# Example data (dput() output): 11 daily rows for two sites, bp10 and bp12.
# It is bound to `d` because every snippet below reads the data frame under
# that name; without the assignment those snippets fail with
# "object 'd' not found".
d <- structure(list(SiteName = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("bp10", "bp12"), class = "factor"),
    DMWT = c(13.9697916666667, 13.9125, 14.2152083333333, 14.7810416666667,
    15.1541666666667, 15.7535416666667, 17.3254166666667, 18.4872916666667,
    20.0564583333333, 21.0595833333333, 21.3925), DMAT = c(16.6714631359947,
    18.474493439025, 20.9517661662977, 23.7017661662978, 25.5957055602372,
    20.9688840743375, 23.7188840743375, 25.6128234682769, 27.5143386197921,
    27.6279749834285, 26.1355507410042), ADD = c(0, 0, 0, 1.90367965367967,
    5.70129870129876, 0, 1.90367965367967, 5.70129870129876,
    11.400432900433, 17.2132034632037, 21.53354978355), Air200 = c(7.3229782875097,
    7.40616010569152, 7.50025101478243, 7.63384949963092, 7.78642525720668,
    7.51736892282216, 7.65096740767065, 7.80354316524641, 7.97854316524641,
    8.14729316524641, 8.29592952888278), Air100 = c(15.2711601056916,
    15.362599499631, 15.512902529934, 15.727296469328, 15.9717661662977,
    15.5300204379738, 15.7444143773677, 15.9888840743374, 16.2306264985798,
    16.4472174076707, 16.6433537713071), Air75 = c(16.8986348531664,
    17.0426752572068, 17.1927762673078, 17.3687358632674, 17.5567156612472,
    17.2098941753475, 17.3858537713071, 17.5738335692869, 17.7820153874687,
    18.0100961955496, 18.2532275086809), Air50 = c(19.5072207117523,
    19.6340388935705, 19.7382813178129, 19.8887358632674, 20.1060085905402,
    19.7553992258526, 19.9058537713072, 20.1231264985799, 20.4400961955496,
    20.7669143773678, 20.9841871046405), Air10 = c(21.9214631359947,
    21.5850994996311, 21.2563116208432, 21.1714631359947, 21.4502510147826,
    21.2734295288829, 21.1885810440344, 21.4673689228223, 21.9696416500951,
    22.3779749834284, 22.5476719531254)), .Names = c("SiteName",
"DMWT", "DMAT", "ADD", "Air200", "Air100", "Air75", "Air50",
"Air10"), row.names = c(547L, 548L, 549L, 550L, 551L, 1593L,
1594L, 1595L, 1596L, 1597L, 1598L), class = "data.frame")

以下是依次把每个变量代入模型的代码。我该如何让它再按站点分组运行?

# Distinct sites, and the candidate second predictors: columns 4-9 of d are
# ADD followed by the five Air* series.
siteslist <- unique(d[["SiteName"]])
varlist <- colnames(d)[4:9]

# One model per candidate predictor, in varlist order. substitute() splices
# the column name into the formula as a symbol before lm() sees it.
models <- lapply(
  varlist,
  function(x) lm(substitute(DMWT ~ DMAT + i, list(i = as.name(x))), data = d)
)

然后获取模型结果并将其转换为数据框:

library(relaimpo)
# Flatten one fitted lm into a single vector: the coefficients, adjusted
# R-squared, RMSE of the residuals, then the first two "betasq" and the first
# two "pratt" relative-importance values from calc.relimp().
sumfun <- function(x) {
  # calc.relimp() refits the decomposition on every call, so run it once per
  # metric type and index the stored result instead of once per element.
  betasq <- calc.relimp(x, type = "betasq")$betasq
  pratt <- calc.relimp(x, type = "pratt")$pratt
  c(coef(x),
    summary(x)$adj.r.squared,
    sqrt(mean(resid(x)^2, na.rm = TRUE)),
    betasq[1],
    betasq[2],
    pratt[1],
    pratt[2])
}
# sapply() yields one column per model; transpose so each model is a row.
mod.df <- as.data.frame(t(sapply(models, sumfun)))

还尝试将变量和网站组合起来做这样的事情:

siteslist <- unique(d$SiteName)
varlist <- names(d)[4:9]
# expand.grid() turns character vectors into factors by default, and as.name()
# on a factor element uses its integer code rather than its label -- that is
# what produced "object '1' not found". Keep the variable names as strings.
sets <- expand.grid(SiteName = siteslist, Var = varlist,
                    stringsAsFactors = FALSE)
# One model per site/variable combination (one row of `sets` each).
# seq_len() also behaves correctly should `sets` ever have zero rows.
models <- lapply(seq_len(nrow(sets)), function(x) {
  # Compare as text so the match does not depend on factor level sets.
  site_rows <- as.character(d$SiteName) == as.character(sets$SiteName[x])
  # Linear model with the air variable substituted in as a symbol.
  lm(substitute(DMWT ~ DMAT + i, list(i = as.name(sets$Var[x]))),
     data = d[site_rows, ])
})

...但我收到错误"Error in eval(expr, envir, enclos) : object '1' not found"

2 个答案:

答案 0 :(得分:0)

我不确定这是否与您想做的完全相同,但data.table和plyr包允许您按多个变量拆分数据后分别拟合模型。下面是一个示例,其中var1和var2只是代表两个分组变量,您希望对它们的每一种取值组合分别建模。

#load packages
library(data.table)
library(plyr)

#break up by variables, then fit the model to each piece
# Break the data up by every var1/var2 combination, then fit the same
# weighted linear model to each piece; dlply() returns the fits as a list.
models <- dlply(data, c("var1", "var2"), function(piece) {
  # `piece` holds the rows of one group; `weights` is looked up inside it.
  lm(DV ~ IV1 + IV2, data = piece, weights = weights)
})
# Apply coef to each model and return a data frame
models_coef <- ldply(models, coef)
# Print summary (TRUE spelled out: `T` is an ordinary, reassignable variable)
l_ply(models_coef, summary, .print = TRUE)

答案 1 :(得分:0)

我就是这样做的。请注意,这是未经测试的,因为我没有安装relaimpo。我真的只是重新打包你的代码。

一般方法是:
1. 编写一个适用于单个组的函数
2. 使用split将数据划分为多个组
3. 使用lapply将该函数应用于每个组
4. (如果需要)将结果合并在一起

对单站点数据子集的首次测试:

我所做的改动只有:(a)提取出一个站点的数据子集,并将其命名为one_site。(b)在建模代码中使用one_site。(c)比起使用substitute,我更喜欢把公式拼接成字符串,所以做了这个改动。(d)调整了空格和格式以便于阅读(主要使用RStudio的“重新格式化代码”)。

## set up
# Candidate second predictors: columns 4-9 of d
varlist <- names(d)[4:9]
library(relaimpo)

# Flatten one fitted lm into a single vector: the coefficients, adjusted
# R-squared, RMSE of the residuals, then the first two "betasq" and the first
# two "pratt" relative-importance values from calc.relimp().
sumfun <- function(x) {
    # Run calc.relimp() once per metric type and index the stored result,
    # rather than repeating the whole computation for every element.
    betasq <- calc.relimp(x, type = "betasq")$betasq
    pratt <- calc.relimp(x, type = "pratt")$pratt
    c(
        coef(x),
        summary(x)$adj.r.squared,
        sqrt(mean(resid(x)^2, na.rm = TRUE)),
        betasq[1],
        betasq[2],
        pratt[1],
        pratt[2]
    )
}

## Testing: this works for one_site
# Keep only the bp10 rows; which() drops rows where the comparison is NA.
one_site <- d[which(d$SiteName == "bp10"), , drop = FALSE]

# Fit DMWT ~ DMAT + <air variable> once per entry of varlist
models <- lapply(varlist, function(x) {
    # Assemble the formula as text, then parse it
    form <- as.formula(paste("DMWT ~ DMAT +", x))
    lm(form, data = one_site)
})

## desired result
# sapply() yields one column per model; transpose so each model is a row
mod.df <- as.data.frame(t(sapply(X = models, FUN = sumfun)))

将其变为函数

一旦有了适用于单个站点的代码,我们就把它转换为一个函数。所需的输入似乎只有一个站点的数据和varlist中的变量。我们不再在末尾把结果赋值给变量,而是用return将其返回。

#' Fit one model per candidate predictor for a single site
#'
#' @param one_site Data frame with the rows of one site. The formula refers
#'   to `DMWT` and `DMAT`, plus whatever each entry of `varlist` names.
#' @param varlist Terms to add, one at a time, as the second predictor in
#'   `DMWT ~ DMAT + <term>`.
#' @return A data frame with one row per entry of `varlist`, holding the
#'   values `sumfun()` returns for the corresponding model. An empty
#'   `varlist` gives an empty data frame.
fit_one_site <- function(one_site, varlist) {
    # With nothing to fit, sapply() would return an empty list and t() would
    # stop with "argument is not a matrix"; return an empty result instead.
    if (length(varlist) == 0L) {
        return(data.frame())
    }
    models <- lapply(varlist, function(x) {
        # Apply the modeling function to our list of air variables
        form <- as.formula(sprintf("DMWT ~ DMAT + %s", x))
        lm(form, data = one_site)  # linear model with air variable substituted
    })
    # sapply() yields one column per model; transpose so each model is a row.
    return(as.data.frame(t(sapply(models, sumfun))))
}

现在,我们可以使用split按SiteName分割您的数据,再用lapply将fit_one_site函数应用于每个部分。

# Fit every site separately: split() gives one data frame per SiteName, and
# drop = TRUE skips factor levels that have no rows (lm() cannot fit those).
results <- lapply(split(d, d$SiteName, drop = TRUE), FUN = fit_one_site,
                  varlist = names(d)[4:9])

结果应该是一个数据框列表,每个站点对应一个。如果您想将它们合并为一个数据框,请参阅“list of data frames”这一R-FAQ问题下我的回答中的相关部分。