从lm汇总系数

时间:2018-12-26 14:14:00

标签: r extract information-extraction tabulate

我有10个线性模型,我只需要一些信息,即:r平方,p值,斜率和截距系数。我设法提取了这些值(通过荒谬地重复代码)。现在,我需要将这些值制成表格(列中的信息;行列出了线性模型1-10的结果)。谁能帮帮我吗?我还有数百个线性模型要做。我确定一定有办法。

Data file hosted here

代码:

d<-read.csv("example.csv",header=T)

# Subset data
A3G1 <- subset(d, CatChro=="A3G1"); A4G1 <- subset(d, CatChro=="A4G1")
A3G2 <- subset(d, CatChro=="A3G2"); A4G2 <- subset(d, CatChro=="A4G2")
A3G3 <- subset(d, CatChro=="A3G3"); A4G3 <- subset(d, CatChro=="A4G3")
A3G4 <- subset(d, CatChro=="A3G4"); A4G4 <- subset(d, CatChro=="A4G4")
A3G5 <- subset(d, CatChro=="A3G5"); A4G5 <- subset(d, CatChro=="A4G5")
A3D1 <- subset(d, CatChro=="A3D1"); A4D1 <- subset(d, CatChro=="A4D1")
A3D2 <- subset(d, CatChro=="A3D2"); A4D2 <- subset(d, CatChro=="A4D2")
A3D3 <- subset(d, CatChro=="A3D3"); A4D3 <- subset(d, CatChro=="A4D3")
A3D4 <- subset(d, CatChro=="A3D4"); A4D4 <- subset(d, CatChro=="A4D4")
A3D5 <- subset(d, CatChro=="A3D5"); A4D5 <- subset(d, CatChro=="A4D5")

# Fit individual lines
rA3G1 <- lm(Qend~Rainfall, data=A3G1); summary(rA3G1)
rA3D1 <- lm(Qend~Rainfall, data=A3D1); summary(rA3D1)
rA3G2 <- lm(Qend~Rainfall, data=A3G2); summary(rA3G2)
rA3D2 <- lm(Qend~Rainfall, data=A3D2); summary(rA3D2)
rA3G3 <- lm(Qend~Rainfall, data=A3G3); summary(rA3G3)
rA3D3 <- lm(Qend~Rainfall, data=A3D3); summary(rA3D3)
rA3G4 <- lm(Qend~Rainfall, data=A3G4); summary(rA3G4)
rA3D4 <- lm(Qend~Rainfall, data=A3D4); summary(rA3D4)
rA3G5 <- lm(Qend~Rainfall, data=A3G5); summary(rA3G5)
rA3D5 <- lm(Qend~Rainfall, data=A3D5); summary(rA3D5)

rA4G1   <- lm(Qend~Rainfall, data=A4G1); summary(rA4G1)
rA4D1   <- lm(Qend~Rainfall, data=A4D1); summary(rA4D1)
rA4G2   <- lm(Qend~Rainfall, data=A4G2); summary(rA4G2)
rA4D2   <- lm(Qend~Rainfall, data=A4D2); summary(rA4D2)
rA4G3   <- lm(Qend~Rainfall, data=A4G3); summary(rA4G3)
rA4D3   <- lm(Qend~Rainfall, data=A4D3); summary(rA4D3)
rA4G4   <- lm(Qend~Rainfall, data=A4G4); summary(rA4G4)
rA4D4   <- lm(Qend~Rainfall, data=A4D4); summary(rA4D4)
rA4G5   <- lm(Qend~Rainfall, data=A4G5); summary(rA4G5)
rA4D5   <- lm(Qend~Rainfall, data=A4D5); summary(rA4D5)

# Gradient
summary(rA3G1)$coefficients[2,1]
summary(rA3D1)$coefficients[2,1]
summary(rA3G2)$coefficients[2,1]
summary(rA3D2)$coefficients[2,1] 
summary(rA3G3)$coefficients[2,1] 
summary(rA3D3)$coefficients[2,1] 
summary(rA3G4)$coefficients[2,1] 
summary(rA3D4)$coefficients[2,1] 
summary(rA3G5)$coefficients[2,1] 
summary(rA3D5)$coefficients[2,1]

# Intercept
summary(rA3G1)$coefficients[2,2] 
summary(rA3D1)$coefficients[2,2] 
summary(rA3G2)$coefficients[2,2] 
summary(rA3D2)$coefficients[2,2] 
summary(rA3G3)$coefficients[2,2] 
summary(rA3D3)$coefficients[2,2] 
summary(rA3G4)$coefficients[2,2] 
summary(rA3D4)$coefficients[2,2] 
summary(rA3G5)$coefficients[2,2] 
summary(rA3D5)$coefficients[2,2] 

# r-sq
summary(rA3G1)$r.squared
summary(rA3D1)$r.squared
summary(rA3G2)$r.squared
summary(rA3D2)$r.squared
summary(rA3G3)$r.squared
summary(rA3D3)$r.squared
summary(rA3G4)$r.squared
summary(rA3D4)$r.squared
summary(rA3G5)$r.squared
summary(rA3D5)$r.squared

# adj r-sq
summary(rA3G1)$adj.r.squared
summary(rA3D1)$adj.r.squared
summary(rA3G2)$adj.r.squared
summary(rA3D2)$adj.r.squared
summary(rA3G3)$adj.r.squared
summary(rA3D3)$adj.r.squared
summary(rA3G4)$adj.r.squared
summary(rA3D4)$adj.r.squared
summary(rA3G5)$adj.r.squared
summary(rA3D5)$adj.r.squared

# p-level
p <- summary(rA3G1)$fstatistic
pf(p[1], p[2], p[3], lower.tail=FALSE) 
p2 <- summary(rA3D1)$fstatistic
pf(p2[1], p2[2], p2[3], lower.tail=FALSE) 
p3 <- summary(rA3G2)$fstatistic
pf(p3[1], p3[2], p3[3], lower.tail=FALSE) 
p4 <- summary(rA3D2)$fstatistic
pf(p4[1], p4[2], p4[3], lower.tail=FALSE) 
p5 <- summary(rA3G3)$fstatistic
pf(p5[1], p5[2], p5[3], lower.tail=FALSE) 
p6 <- summary(rA3D3)$fstatistic
pf(p6[1], p6[2], p6[3], lower.tail=FALSE) 
p7 <- summary(rA3G4)$fstatistic
pf(p7[1], p7[2], p7[3], lower.tail=FALSE) 
p8 <- summary(rA3D4)$fstatistic
pf(p8[1], p8[2], p8[3], lower.tail=FALSE) 
p9 <- summary(rA3G5)$fstatistic
pf(p9[1], p9[2], p9[3], lower.tail=FALSE) 
p10 <- summary(rA3D5)$fstatistic
pf(p10[1], p10[2], p10[3], lower.tail=FALSE) 

这是我预期结果的结构:

Expected outcome

有什么办法可以做到这一点?

5 个答案:

答案 0 :(得分:1)

您可以使用library(data.table)

d <- fread("example.csv")
d[, .(
  r2         = (fit <- summary(lm(Qend~Rainfall)))$r.squared,
  adj.r2     = fit$adj.r.squared,
  intercept  = fit$coefficients[1,1], 
  gradient   = fit$coefficients[2,1],
  p.value    = {p <- fit$fstatistic; pf(p[1], p[2], p[3], lower.tail=FALSE)}),
  by  = CatChro]

#    CatChro         r2       adj.r2   intercept     gradient      p.value
# 1:    A3G1 0.03627553  0.011564648 0.024432020 0.0001147645 0.2329519751
# 2:    A3D1 0.28069553  0.254054622 0.011876543 0.0004085644 0.0031181110
# 3:    A3G2 0.06449971  0.041112205 0.026079409 0.0004583538 0.1045970987
# 4:    A3D2 0.03384173  0.005425311 0.023601325 0.0005431693 0.2828170556
# 5:    A3G3 0.07587433  0.054383038 0.043537869 0.0006964512 0.0670399684
# 6:    A3D3 0.04285322  0.002972105 0.022106960 0.0001451185 0.3102578215
# 7:    A3G4 0.17337420  0.155404076 0.023706652 0.0006442175 0.0032431299
# 8:    A3D4 0.37219027  0.349768492 0.009301843 0.0006614213 0.0003442445
# 9:    A3G5 0.17227383  0.150491566 0.025994831 0.0006658466 0.0077413595
#10:    A3D5 0.04411669 -0.008987936 0.014341399 0.0001084626 0.3741011769

答案 1 :(得分:1)

考虑构建lm结果的矩阵。首先创建一个定义的函数,以处理带有结果提取的通用数据框架模型。然后,调用by,它可以按因子列将数据帧作为子集并将每个子集传递到已定义的方法中。最后,rbind将所有分组的矩阵放在一起以获得单个输出

lm_results <- function(df) {

  model <- lm(Qend ~ Rainfall, data = df)
  res <- summary(model)

  p <- res$fstatistic

  c(gradient = res$coefficients[2,1],
    intercept = res$coefficients[2,2],
    r_sq = res$r.squared,
    adj_r_sq = res$adj.r.squared,
    f_stat = p[['value']],
    p_value = unname(pf(p[1], p[2], p[3], lower.tail=FALSE))
  )
}

matrix_list <- by(d, d$group, lm_results)

final_matrix <- do.call(rbind, matrix_list)

演示随机种子数据

set.seed(12262018)
data_tools <- c("sas", "stata", "spss", "python", "r", "julia")

d <- data.frame(
  group = sample(data_tools, 500, replace=TRUE),
  int = sample(1:15, 500, replace=TRUE),
  Qend = rnorm(500) / 100,
  Rainfall = rnorm(500) * 10
)

结果

mat_list <- by(d, d$group, lm_results)

final_matrix <- do.call(rbind, mat_list)
final_matrix

#             gradient    intercept        r_sq     adj_r_sq    f_stat    p_value
# julia  -1.407313e-04 1.203832e-04 0.017219149  0.004619395 1.3666258 0.24595273
# python -1.438116e-04 1.125170e-04 0.018641512  0.007230367 1.6336233 0.20464162
# r       2.031717e-04 1.168037e-04 0.041432175  0.027738349 3.0256098 0.08635510
# sas    -1.549510e-04 9.067337e-05 0.032476668  0.021355710 2.9203121 0.09103619
# spss    9.326656e-05 1.068516e-04 0.008583473 -0.002682623 0.7618853 0.38511469
# stata  -7.079514e-05 1.024010e-04 0.006013841 -0.006568262 0.4779679 0.49137093

答案 2 :(得分:1)

这是基本的R解决方案:

data <- read.csv("./data/so53933238.csv",header=TRUE)

# split by value of CatChro into a list of datasets
dataList <- split(data,data$CatChro)

# process the list with lm(), extract results to a data frame, write to a list
lmResults <- lapply(dataList,function(x){
     y <- summary(lm(Qend ~ Rainfall,data = x))
     Intercept <- y$coefficients[1,1]
     Slope <- y$coefficients[2,1]
     rSquared <- y$r.squared
     adjRSquared <- y$adj.r.squared
     f <- y$fstatistic[1]
     pValue <- pf(y$fstatistic[1],y$fstatistic[2],y$fstatistic[3],lower.tail = FALSE)
     data.frame(Slope,Intercept,rSquared,adjRSquared,pValue)
})
lmResultTable <- do.call(rbind,lmResults)
# add CatChro indicators
lmResultTable$catChro <- names(dataList)

lmResultTable 

...以及输出:

    > lmResultTable
            Slope   Intercept   rSquared  adjRSquared       pValue catChro
A3D1 0.0004085644 0.011876543 0.28069553  0.254054622 0.0031181110    A3D1
A3D2 0.0005431693 0.023601325 0.03384173  0.005425311 0.2828170556    A3D2
A3D3 0.0001451185 0.022106960 0.04285322  0.002972105 0.3102578215    A3D3
A3D4 0.0006614213 0.009301843 0.37219027  0.349768492 0.0003442445    A3D4
A3D5 0.0001084626 0.014341399 0.04411669 -0.008987936 0.3741011769    A3D5
A3G1 0.0001147645 0.024432020 0.03627553  0.011564648 0.2329519751    A3G1
A3G2 0.0004583538 0.026079409 0.06449971  0.041112205 0.1045970987    A3G2
A3G3 0.0006964512 0.043537869 0.07587433  0.054383038 0.0670399684    A3G3
A3G4 0.0006442175 0.023706652 0.17337420  0.155404076 0.0032431299    A3G4
A3G5 0.0006658466 0.025994831 0.17227383  0.150491566 0.0077413595    A3G5
>

要以HTML表格格式呈现输出,可以使用knitr::kable()

library(knitr)
kable(lmResultTable[1:5],row.names=TRUE,digits=5) 

...在呈现Markdown之后会产生以下输出:

enter image description here

答案 3 :(得分:1)

这里只有几行:

library(tidyverse)
library(broom)
# create grouped dataframe:
df_g <- df %>% group_by(CatChro)
df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>% 
   select(CatChro, term, estimate) %>% spread(term, estimate) %>% 
   left_join(df_g %>% do(glance(lm(Qend ~ Rainfall, data = .))) %>%
   select(CatChro, r.squared, adj.r.squared, p.value), by = "CatChro")

结果将是:

# A tibble: 10 x 6
# Groups:   CatChro [?]
   CatChro `(Intercept)` Rainfall r.squared adj.r.squared  p.value
   <chr>           <dbl>    <dbl>     <dbl>         <dbl>    <dbl>
 1 A3D1          0.0119  0.000409    0.281        0.254   0.00312 
 2 A3D2          0.0236  0.000543    0.0338       0.00543 0.283   
 3 A3D3          0.0221  0.000145    0.0429       0.00297 0.310   
 4 A3D4          0.00930 0.000661    0.372        0.350   0.000344
 5 A3D5          0.0143  0.000108    0.0441      -0.00899 0.374   
 6 A3G1          0.0244  0.000115    0.0363       0.0116  0.233   
 7 A3G2          0.0261  0.000458    0.0645       0.0411  0.105   
 8 A3G3          0.0435  0.000696    0.0759       0.0544  0.0670  
 9 A3G4          0.0237  0.000644    0.173        0.155   0.00324 
10 A3G5          0.0260  0.000666    0.172        0.150   0.00774 

那么,这是怎么工作的?

下面的代码创建一个具有所有系数和相应统计信息的数据帧(整洁地将lm的结果转换为数据帧):

df_g %>%
  do(tidy(lm(Qend ~ Rainfall, data = .)))
A tibble: 20 x 6
Groups:   CatChro [10]
   CatChro term        estimate std.error statistic      p.value
   <chr>   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
 1 A3D1    (Intercept) 0.0119   0.00358       3.32  0.00258     
 2 A3D1    Rainfall    0.000409 0.000126      3.25  0.00312     
 3 A3D2    (Intercept) 0.0236   0.00928       2.54  0.0157      
 4 A3D2    Rainfall    0.000543 0.000498      1.09  0.283       

我知道您想将截距和降雨的系数作为单独的列,所以让我们“分布”它们。这是通过首先选择相关列,然后调用tidyr::spread来实现的,如

select(CatChro, term, estimate) %>% spread(term, estimate)

这给您:

df_g %>% do(tidy(lm(Qend ~ Rainfall, data = .))) %>% 
  select(CatChro, term, estimate) %>% spread(term, estimate)
A tibble: 10 x 3
Groups:   CatChro [10]
   CatChro `(Intercept)` Rainfall
   <chr>           <dbl>    <dbl>
 1 A3D1          0.0119  0.000409
 2 A3D2          0.0236  0.000543
 3 A3D3          0.0221  0.000145
 4 A3D4          0.00930 0.000661

Glance为您提供了每个模型所需的摘要统计信息。这些模型按组(在这里是CatChro)进行索引,因此很容易将它们合并到先前的数据帧中,这就是其余代码的含义。

答案 4 :(得分:1)

另一个解决方案,使用lme4::lmList。由summary()生成的对象的lmList方法几乎可以完成您想要的所有操作(尽管它不存储p值,但我必须在下面添加一些内容)。

m <- lme4::lmList(Qend~Rainfall|CatChro,data=d)
s <- summary(m)
pvals <- apply(s$fstatistic,1,function(x) pf(x[1],x[2],x[3],lower.tail=FALSE))
data.frame(intercept=coef(s)[,"Estimate","(Intercept)"],
           slope=coef(s)[,"Estimate","Rainfall"],
           r.squared=s$r.squared,
           adj.r.squared=unlist(s$adj.r.squared),
           p.value=pvals)