The following is my data frame:
# Toy data: four groups (dataID 1-4) of ten rows each; y runs 1..10 within
# every group and x1-x3 are independent standard-normal draws.
data <- data.frame(
  y = rep(1:10, times = 4),
  dataID = rep(1:4, each = 10),
  x1 = rnorm(40),
  x2 = rnorm(40),
  x3 = rnorm(40)
)
For each combination of dataID and x variable, I want to calculate the adjusted R-squared of a linear regression of y on that x:
# Fit y ~ x separately for every (predictor, dataID) pair and keep the adjusted
# R-squared of each fit.
# Expects dplyr to be attached (for %>%) and a data frame `data` with columns
# y, dataID and the predictor columns.
variable <- c("x1", "x2", "x3", "x4")
# Only iterate over predictors that exist in `data`; selecting a missing
# column (e.g. "x4") would otherwise abort the loop with an error.
variable <- intersect(variable, names(data))
# The group ids do not depend on the predictor, so compute them once.
modID <- sort(unique(data$dataID))

# One row per predictor and one column per dataID, so that no result is
# overwritten by a later iteration.
RsqTable <- matrix(
  NA_real_,
  nrow = length(variable),
  ncol = length(modID),
  dimnames = list(variable, as.character(modID))
)

for (v in seq_along(variable)) {
  varref <- variable[v]
  # all_of() states explicitly that `varref` holds a column name.
  temp <- data %>% dplyr::select(y, dataID, dplyr::all_of(varref))
  for (m in seq_along(modID)) {
    modRef <- modID[m]
    tempMod <- temp %>%
      dplyr::filter(dataID == modRef) %>%
      dplyr::select(-dataID)
    # tempMod holds only y and one predictor, so `.` stands for that predictor.
    Rsq <- summary(lm(y ~ ., data = tempMod))$adj.r.squared
    RsqTable[v, m] <- Rsq
  }
}
However, what I really want is to regress y on both the linear and the squared (non-linear) term of each x. So I am wondering whether there is a way to refer to the non-linear term like this:
# Illustrative, wished-for syntax: add a squared term for whichever predictor
# `.` stands for in tempMod.
# NOTE(review): `.` is presumably not expanded inside I(), so this call is not
# expected to run as written -- confirm before relying on it.
Rsq <- summary(lm(y ~ . + I(.^2), data = tempMod))$adj.r.squared
答案 0 :(得分:1)
您可以首先使用 split 按 datID 将数据拆分为若干子集。
# Partition the data by datID. The datID column itself (column 2 of dat) is
# dropped first, so each piece holds only y and the x* predictors.
sl <- split(x = dat[-2], f = dat$datID)
然后使用 stack 将每个子集堆叠成如下所示的长格式。
# values ind
# 1 -0.18558347 x1
# 2 0.26342365 x1
# 3 2.24365427 x1
# ....
# Reshape every per-datID subset to long form: columns 2-4 (the x* predictors)
# are stacked into `values`/`ind`, and y is attached alongside them.
l <- lapply(sl, function(piece) {
  stacked <- stack(piece, select = 2:4)
  cbind(y = piece$y, stacked)
})
第三步,使用 by 对每个 x* 变量分别进行回归,这将得到所需的调整后 R 平方(adj. R-squared)矩阵,其中 datID 作为列,x* 作为行。
# For each datID subset, regress y on `values` separately within every level
# of `ind` and collect the adjusted R-squared of each fit.
sapply(l, function(long_df) {
  by(long_df, long_df$ind, function(chunk) {
    fit <- lm(y ~ values, data = chunk)
    summary(fit)$adj.r.squared
  })
})
# 1 2 3 4
# x1 -0.02913955 -0.07963248 -0.002224305 0.07005255
# x2 -0.11442395 0.13642959 -0.116281328 0.21777396
# x3 0.15864022 -0.11180017 0.110556373 0.18661920
您可以轻松地验证结果。
# Spot check: refit the datID == 1 / x1 model directly on the raw data.
d1 <- dat[dat$datID == 1, c("y", "x1")]
summary(lm(formula = y ~ x1, data = d1))[["adj.r.squared"]]
# [1] -0.02913955
正如人们所看到的,结果应该与矩阵的第一个单元格相同。
对于多项式(含二次项)回归:
# Same per-subset, per-predictor regression as the linear version, with a
# squared term added; I() keeps `^` as arithmetic inside the formula.
sapply(l, function(long_df) {
  by(long_df, long_df$ind, function(chunk) {
    fit <- lm(y ~ values + I(values^2), data = chunk)
    summary(fit)$adj.r.squared
  })
})
# 1 2 3 4
# x1 -0.09385569 -0.15431438 -0.13778041 0.61666182
# x2 -0.12810364 0.06199001 -0.17033677 0.16826139
# x3 0.32189185 -0.26710528 0.04131524 0.07438912
数据
# Example data used by the code in this answer: a 40-row data frame with
# integer columns y and datID (four groups of ten rows) and numeric
# predictor columns x1, x2 and x3.
dat <- structure(list(y = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L),
datID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), x1 = c(-0.185583474894103,
0.263423651143744, 2.2436542652316, -1.13150593879308, -1.03459059777182,
-1.0188653780855, 1.77189001207599, -0.934738205360317, -0.666096892490933,
-0.826042921788607, -0.272687984798706, -1.09852037093336,
0.46071342230167, 0.268619377702867, -0.328748146462296,
1.53440197834959, -0.220853326721626, -0.808382808810487,
-1.66645442449068, -0.242353150310531, 1.26231491749131,
0.162668675840026, 0.904121249800934, 0.182921809090968,
0.0976287057955422, -0.85694452908352, 0.351871098571782,
1.0762092522706, -1.11524095896311, 0.656514413311347, -0.384391504028304,
0.528952365385126, -0.151039712939782, -1.21953511523724,
0.976270995535166, -1.04402181901669, -1.37102481436038,
1.42117000396589, 0.912663580389998, 1.4754732274035), x2 = c(0.0402247269190048,
1.33038146656669, -0.459598594357653, -0.856969098432259,
0.228451646834162, -0.143028413102539, -1.70899900410409,
0.507370783610232, 1.56538440117379, -0.951401815451773,
-0.393071689633881, -0.297290136122118, 0.828629815124767,
0.515465904120197, 0.190957567662703, 1.31797724653282, 0.747333126551401,
1.80770651506827, -0.0692143782238828, 0.733652510844958,
0.513558767430303, -0.20007253887957, -0.96235732491512,
0.0696221474571983, 0.420165653583179, 0.575500351812627,
-3.10456905099436, -1.12954301022196, -0.0133568751505841,
0.872703653757334, -2.16354318496144, -0.040885605623714,
-2.23800995886235, 0.0875803347755469, 0.334286223450742,
1.14624718629833, 1.73296419775862, 0.630121428827583, -0.0854394504507166,
-0.119157739756379), x3 = c(0.761736068860907, -0.407127443123656,
0.35298681720387, -0.618056268737422, 0.990213082637937,
1.3715711543413, -0.389277775804358, -0.332620507679104,
-0.705017421192073, -1.38091039816317, -0.190763711175575,
-0.489473208003468, -0.637223536105616, 3.45385530080228,
-0.159752883157984, 1.51368282996416, -0.128118214564304,
1.11468646349289, -1.79708316700774, -0.0857055812374004,
0.496044729769967, -1.87377039638354, -1.36550978490534,
0.457670667628235, -1.53146832089459, 2.04124827349235, 1.07993603816386,
0.0927402664330579, 1.02713895045668, 0.359269719833989,
0.886254022672678, -0.0513908691703132, 0.0652030406719866,
-0.744441295151328, 0.988314721587972, -0.275845245520699,
0.611272789477384, -1.56892502735795, -0.480009166030005,
-0.755614369802782)), class = "data.frame", row.names = c(NA,
-40L))
答案 1 :(得分:1)
考虑为 x 项和 x^2 项构建字符串形式的公式:
# Build one formula string per predictor containing its linear and squared term.
variable <- c("x1", "x2", "x3", "x4")
# The square must be wrapped in I(): inside a formula a bare `^` is a
# term-crossing operator, so "x1 + x1^2" reduces to just "x1" and the
# quadratic term would silently be dropped from the fit.
# NOTE: the printed results shown further down in this answer were produced
# with the un-wrapped strings, so their labels and values will differ.
formulas_vec <- paste0("y ~ ", variable, " + I(", variable, "^2)")
formulas_vec
# [1] "y ~ x1 + I(x1^2)" "y ~ x2 + I(x2^2)" "y ~ x3 + I(x3^2)" "y ~ x4 + I(x4^2)"
然后在 by(按一个或多个因子将数据框拆分为子集,并把每个子集传给指定函数)内部,用 sapply 遍历该字符向量,对每个公式调用 lm:
# For every dataID subset, fit each formula string in formulas_vec and keep
# the adjusted R-squared; the result holds one named numeric vector per dataID.
by_list <- by(data, data$dataID, function(grp) {
  sapply(formulas_vec, function(fml) {
    fit <- lm(as.formula(fml), data = grp)
    summary(fit)$adj.r.squared
  })
})
by_list
# data$dataID: 1
# y ~ x1 + x1^2 y ~ x2 + x2^2 y ~ x3 + x3^2 y ~ x4 + x4^2
# 0.09630843 -0.10987047 0.19007264 -0.01738122
# ------------------------------------------------------------
# data$dataID: 2
# y ~ x1 + x1^2 y ~ x2 + x2^2 y ~ x3 + x3^2 y ~ x4 + x4^2
# -0.04974500 -0.08639353 -0.10195335 0.02649648
# ------------------------------------------------------------
# data$dataID: 3
# y ~ x1 + x1^2 y ~ x2 + x2^2 y ~ x3 + x3^2 y ~ x4 + x4^2
# 0.08430858 0.41739019 -0.02605365 -0.11197322
# ------------------------------------------------------------
# data$dataID: 4
# y ~ x1 + x1^2 y ~ x2 + x2^2 y ~ x3 + x3^2 y ~ x4 + x4^2
# -0.12424060 -0.11589655 -0.12129724 -0.08263092
# Bind the per-dataID vectors row-wise: one row per dataID, one column per
# formula.
adj_r_matrix <- do.call(what = "rbind", args = by_list)
adj_r_matrix
# y ~ x1 + x1^2 y ~ x2 + x2^2 y ~ x3 + x3^2 y ~ x4 + x4^2
# 1 0.09630843 -0.10987047 0.19007264 -0.01738122
# 2 -0.04974500 -0.08639353 -0.10195335 0.02649648
# 3 0.08430858 0.41739019 -0.02605365 -0.11197322
# 4 -0.12424060 -0.11589655 -0.12129724 -0.08263092