我正在尝试做简单的线性回归分析:一个数据框有4列(均为因变量),另一个数据框有194行和212列(自变量)。此外,我还有另外5个数据框,要在同样的分析中用作因变量。
对于单个因变量,我已经得到了预期的结果,但我需要把分析扩展到更多的因变量。我尝试再加一层for循环(遍历因变量的各列),但这样还需要同时创建更多的空列表来保存结果。
我想知道我将如何实现这一目标?
我当前的for循环是:
# Dependent variables: every column of Green_Class_Commercial except the first.
y <- data.frame(Green_Class_Commercial[, -1])
# Independent variables: rows 1-175 of lagvar without its first two columns.
x <- data.frame(lagvar[1:175, c(-1, -2)])

# Regress the first dependent variable on each column of x in turn and keep
# one summary row per predictor. seq_along() is safe when x has no columns
# (1:length(x) would iterate over c(1, 0)), and building all rows first and
# binding once avoids growing the data frame inside a loop.
out <- do.call(rbind, lapply(seq_along(x), function(i) {
  m <- summary(lm(y[, 1] ~ x[, i])) # run model
  data.frame(
    Variable = names(x)[i],              # variable name
    Intercept = m$coefficients[1, 1],    # intercept
    Coefficient = m$coefficients[2, 1],  # slope
    `P-val` = m$coefficients[2, 4],      # p-value of the slope
    `R-square` = m$r.squared,            # R-squared
    check.names = FALSE,                 # keep "P-val" / "R-square" as written
    stringsAsFactors = FALSE
  )
}))
head(out)
提供输出
> head(out)
Variable Intercept Coefficient P-val R-square
1 GDP.SC 0.2540527 -4.722220e-07 0.7032087 8.411229e-04
2 GDP.SC1 0.1148311 3.107631e-07 0.7959237 3.899366e-04
3 GDP.SC2 0.1609010 4.998762e-08 0.9673014 9.855831e-06
4 GDP.SC3 0.1353608 1.959274e-07 0.8746321 1.468544e-04
5 GDP.SC4 0.1439931 1.487237e-07 0.9064221 8.200597e-05
6 CivilianLaborForce.SC 0.2595231 -4.078450e-08 0.7716514 4.881398e-04
>
所以这是我要运行回归的变量
# The x variable: dput() output of a 20-row sample data frame with the
# columns GDP.SC and GDP.SC1 ... GDP.SC4. Each GDP.SCk column starts with k
# NA values (presumably lag-k copies of GDP.SC -- confirm against the code
# that builds lagvar).
structure(list(GDP.SC = c(154698, 154698, 154698, 154698, 154698,
154698, 154698, 154698, 154698, 154698, 160138.4, 160138.4, 160138.4,
160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4
), GDP.SC1 = c(NA, 154698, 154698, 154698, 154698, 154698, 154698,
154698, 154698, 154698, 154698, 160138.4, 160138.4, 160138.4,
160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4),
GDP.SC2 = c(NA, NA, 154698, 154698, 154698, 154698, 154698,
154698, 154698, 154698, 154698, 154698, 160138.4, 160138.4,
160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4
), GDP.SC3 = c(NA, NA, NA, 154698, 154698, 154698, 154698,
154698, 154698, 154698, 154698, 154698, 154698, 160138.4,
160138.4, 160138.4, 160138.4, 160138.4, 160138.4, 160138.4
), GDP.SC4 = c(NA, NA, NA, NA, 154698, 154698, 154698, 154698,
154698, 154698, 154698, 154698, 154698, 154698, 160138.4,
160138.4, 160138.4, 160138.4, 160138.4, 160138.4)), row.names = c(NA,
20L), class = "data.frame")
# The y variable: dput() output of a 20-row sample data frame. Column X holds
# the integers 1 to 20; ComBus, ComCon, ComNoo and ComOO are the four
# dependent variables (ComCon contains NA values).
structure(list(X = 1:20, ComBus = c(0.83, 0, 0.23, 0.09, 0.1,
0.11, 0.15, 0.18, 0.37, 0.19, 0, 0.18, 0.09, 0.1, 0.03, 0.5,
0.14, 0.17, 0.11, 0.06), ComCon = c(NA, 0, 0, 0, 0, 0.5, 0, 0,
NA, 0.67, 0, 0, 0, 0, 0.5, 0, 0, NA, 1, 0), ComNoo = c(0.25,
0.14, 0.38, 0.17, 0.14, 0.33, 0.44, 0.05, 0.04, 0.1, 0.18, 0.06,
0.23, 0.14, 0.5, 0.14, 0.5, 0, 0.14, 0.23), ComOO = c(0, 0, 0,
0, 0, 0.33, 0, 0, 0, 0.18, 0.22, 0.15, 0, 0, 0.17, 0, 0, 0, 0,
0)), row.names = c(NA, 20L), class = "data.frame")
答案 0 :(得分:1)
好的,这样能满足您的需求吗?希望您不介意我把循环换成了apply。
### Some dummy data frames
# Build one dummy data frame with n_obs rows: v1 and v2 are standard-normal
# draws, v3 and v4 are uniform draws between 1 and 1000. The sample size is
# passed as a plain count (rnorm(1:10) only worked because rnorm() falls back
# to the length of a vector argument).
make_dummy_df <- function(n_obs = 10) {
  data.frame(
    v1 = rnorm(n_obs),
    v2 = rnorm(n_obs),
    v3 = runif(n_obs, 1, 1000),
    v4 = runif(n_obs, 1, 1000)
  )
}

# Four independent frames, created in the order x, x2, y, y2 so the sequence
# of random draws is the same as with four hand-written data.frame() calls.
x <- make_dummy_df()
x2 <- make_dummy_df()
y <- make_dummy_df()
y2 <- make_dummy_df()
###
#' Fit one simple linear regression per predictor column.
#'
#' Regresses the first column of `y_df` on each column of `x_df` in turn,
#' using a functional instead of an explicit loop.
#'
#' @param y_df Data frame of dependent variables; only column 1 is used.
#' @param x_df Data frame (or matrix) of independent variables.
#' @return A list of `lm` fits, one per column of `x_df`, named after those
#'   columns. The complete fits are returned so any statistic can be pulled
#'   out afterwards.
lm_func <- function(y_df, x_df) {
  # lapply() walks the columns one by one and keeps each column's own type.
  # apply() would first coerce the whole data frame to a matrix, turning
  # every column into character as soon as a single column is non-numeric.
  lapply(as.data.frame(x_df), function(x) {
    lm(y_df[, 1] ~ x)
  })
}
library(data.table)
library(broom)

# Fit the first column of y against every column of x. Each list element is a
# complete lm object, kept in case other statistics are needed later.
results_list <- lm_func(y, x)

# Model-level summaries from glance(), kept as a named list ...
results_list_glance <- lapply(results_list, glance)
# ... and stacked into one table with rbindlist(); the "var_name" column
# records which list element (predictor) each row came from.
results_glance <- rbindlist(results_list_glance, idcol = "var_name")
# To run the function with a single x argument but several y arguments, use
# mapply(): lm_func is called once per element of y_df, while everything in
# MoreArgs is passed unchanged to every call.
results_list_m <- mapply(
  lm_func,
  y_df = list(y, y2),
  MoreArgs = list(x_df = x), # arguments held fixed across calls
  SIMPLIFY = FALSE # spelled out: `F` is an ordinary, reassignable variable
)
# A list of lists of lm fits is awkward to work with, so this variant folds
# the glance() and rbindlist() steps into the function itself.
#
#' Fit one regression per predictor column and return a single summary table.
#'
#' @param y_df Data frame of dependent variables; only column 1 is used.
#' @param x_df Data frame (or matrix) of independent variables.
#' @return The result of `rbindlist()` over the `glance()` summary of each
#'   fit, with a `var_name` column holding the name of the `x_df` column.
lm_func_bind <- function(y_df, x_df) {
  # Iterate over columns with lapply(): apply() would coerce the data frame
  # to a matrix and turn all predictors into character if any column is
  # non-numeric.
  fits <- lapply(as.data.frame(x_df), function(x) {
    lm(y_df[, 1] ~ x)
  })
  rbindlist(lapply(fits, glance), idcol = "var_name")
}
# Single dependent frame: one summary table.
results_glance_df <- lm_func_bind(y, x)

# Several dependent frames against the same x: one summary table per element
# of y_df, returned as a list.
results_list_dfs <- mapply(
  lm_func_bind,
  y_df = list(y, y2),
  MoreArgs = list(x_df = x), # arguments held fixed across calls
  SIMPLIFY = FALSE # spelled out: `F` is an ordinary, reassignable variable
)
请告诉我是否还有可以改进的地方。如果您不熟悉 `apply` 和 `rbindlist` 之类的函数,建议查阅它们的文档。
干杯!
P.S. 通常不建议反复拟合大量线性模型,因为模型一多,仅凭偶然就出现"显著"结果的概率也会增大。不过这更多是统计问题,而不是编程问题!
答案 1 :(得分:1)
考虑使用嵌套的 `lapply`:外层调用遍历因变量数据框的每一列,内层调用遍历自变量数据框的所有列:
#' Regress one response vector on every column of a predictor data frame.
#'
#' @param yvar Response vector (e.g. one column of the dependent-variable
#'   data frame).
#' @param xdf Data frame of predictors; one simple regression is fitted per
#'   column.
#' @return A data frame with one row per column of `xdf` and the columns
#'   Variable, Intercept, Coefficient, P_val and R_square.
reg_data <- function(yvar, xdf) {
  # Iterate through each column of xdf. The argument itself is used here (not
  # a global `x`), so the function works with whatever frame the caller
  # passes in.
  df_list <- lapply(seq_along(xdf), function(i) {
    m <- summary(lm(yvar ~ xdf[[i]]))       # run model
    data.frame(
      Variable = names(xdf)[i],             # variable name
      Intercept = m$coefficients[1, 1],     # intercept
      Coefficient = m$coefficients[2, 1],   # slope
      P_val = m$coefficients[2, 4],         # p-value of the slope
      R_square = m$r.squared                # R-squared
    )
  })
  do.call(rbind, df_list)
}
# Iterate through each column of y except the first, regressing it on all
# columns of x; the result is a list of summary data frames named after the
# y columns.
model_dfs <- lapply(y[-1], reg_data, xdf = x)
输出
model_dfs
# $ComBus
# Variable Intercept Coefficient P_val R_square
# 1 GDP.SC 2.6988486 -1.599147e-05 0.3262406 0.053555129
# 2 GDP.SC1 -0.1802638 2.083180e-06 0.8452577 0.002304901
# 3 GDP.SC2 0.4443504 -1.838100e-06 0.8656578 0.001843828
# 4 GDP.SC3 -0.2114691 2.310754e-06 0.8410848 0.002767098
# 5 GDP.SC4 -0.4596142 3.921280e-06 0.7517165 0.007381776
# $ComCon
# Variable Intercept Coefficient P_val R_square
# 1 GDP.SC -0.4342988 3.752788e-06 0.8970060 0.001154220
# 2 GDP.SC1 -1.5050149 1.056908e-05 0.7148913 0.009154924
# 3 GDP.SC2 -2.2666678 1.549256e-05 0.6144502 0.018606737
# 4 GDP.SC3 -3.2822050 2.205720e-05 0.5032585 0.035178198
# 5 GDP.SC4 -4.7039571 3.124770e-05 0.3808557 0.064522691
# $ComNoo
# Variable Intercept Coefficient P_val R_square
# 1 GDP.SC -0.02348033 1.470480e-06 0.9087818 0.000749555
# 2 GDP.SC1 -0.33062799 3.410697e-06 0.8011075 0.003836926
# 3 GDP.SC2 -1.11901191 8.455261e-06 0.5536610 0.022365205
# 4 GDP.SC3 -1.58084828 1.134370e-05 0.4400999 0.040243587
# 5 GDP.SC4 -2.12276002 1.482734e-05 0.3493362 0.062765524
# $ComOO
# Variable Intercept Coefficient P_val R_square
# 1 GDP.SC -0.03430512 5.514300e-07 0.9481025 0.000241968
# 2 GDP.SC1 1.13773433 -6.882664e-06 0.4347716 0.036277535
# 3 GDP.SC2 1.98603902 -1.226932e-05 0.1785540 0.110105644
# 4 GDP.SC3 1.89971836 -1.171132e-05 0.2291494 0.094842038
# 5 GDP.SC4 1.78462415 -1.096733e-05 0.2963624 0.077531366