使用for循环仅获取选定变量的相关性

时间:2019-06-19 15:29:48

标签: r for-loop indexing data.table correlation

我有一个数据集,如下所示:

# Build a reproducible toy data set: 5 ids x 20 monthly dates = 100 rows,
# split into groups "A" (first 40 rows) and "B" (remaining 60 rows).
library(data.table)  # data.table() below needs the package attached

set.seed(1)
TDT <- data.table(
  Group = rep(c("A", "B"), times = c(40, 60)),
  # Kept as doubles (not 1:5) so the column type matches the original data.
  Id = rep(c(1, 2, 3, 4, 5), each = 20),
  # 20 dates one month apart, each shifted back by one day, repeated per id.
  Time = rep(
    seq(as.Date("2010-01-03"), length.out = 20, by = "1 month") - 1,
    5
  ),
  # The random columns are drawn in this exact order so that set.seed(1)
  # keeps producing the same values.
  norm = round(runif(100) / 10, 2),
  y = sample(100, 100),
  x2 = round(rnorm(100, 0.75, 0.3), 2),
  x3 = round(rnorm(100, 0.75, 0.3), 2),
  x4 = round(rnorm(100, 0.75, 0.3), 2),
  x5 = round(rnorm(100, 0.75, 0.3), 2)
)

我想用TDT中所选变量之间的相关性来创建一个新的数据集。为了获得因变量 y 以及选定的自变量(xcor)与所有其他自变量的相关性,我尝试过:

# Collect the names of the numeric columns of TDT (used as .SDcols below).
numcols <- names(Filter(is.numeric,TDT))

# Per Time group, correlate every numeric column (.SD) with column y.
TDT.y.cor<- TDT[, cor(.SD, y),.SDcols = numcols, by = Time]

# Attempt to correlate every numeric column with the selected x columns.
# This call fails with "'y' must be numeric": xcor is a character vector of
# column names, so cor() receives the strings themselves as its second
# argument rather than the values of columns x2 and x3.
xcor <- c("x2","x3")
TDT.x.cor <- TDT[, cor(.SD, xcor),.SDcols = numcols, by = Time]

但是最后一个给出了错误:

Error in cor(.SD, xcor) : 
  'y' must be numeric

所以现在我想做一个for循环。像这样:

# Attempted loop over the selected column names; not valid R as written.
xcor <- list ("x2","x3")
for (i in xcor) {
# `TDT.xcor[[i]].cor` is not a valid assignment target. Also, `i` already
# holds the column name itself ("x2", then "x3"), so `xcor[[i]]` is not a
# positional lookup into the list.
TDT.xcor[[i]].cor <- TDT[, cor(.SD, xcor[[i]]),.SDcols = numcols, by = Time]
}

这不是正确的语法。有什么建议吗?

2 个答案:

答案 0 :(得分:2)

免责声明:此答案使用了我编写的 manymodelr 包的开发版本。

要获取所有xs和y之间的相关性,我们可以使用:

# Correlate y with every other numeric column of TDT.
# get_var_corr() is provided by the manymodelr package.
numeric_part <- Filter(is.numeric, TDT)
manymodelr::get_var_corr(
  numeric_part,
  comparison_var = "y",
  get_all = TRUE
)

这将产生:

   Comparison_Var Other_Var      p_value Correlation    lower_ci  upper_ci
1              y        Id 0.7660224000  0.03013023 -0.16727786 0.2252155
2              y      norm 0.5323952847  0.06316715 -0.13492501 0.2564040
3              y        x2 0.7032924284 -0.03855859 -0.23321046 0.1590642
4              y        x3 0.4898005278  0.06985644 -0.12832239 0.2626704
5              y        x4 0.2702013263  0.11131634 -0.08700408 0.3011500
6              y        x5 0.0009658689  0.32510267  0.13746236 0.4902174

如果只想对选定的变量(即某一个 x 与其他几个 x)计算相关性:

# Correlate x2 only with the explicitly listed columns (x4 and x5)
# instead of with every numeric column.
numeric_part <- Filter(is.numeric, TDT)
manymodelr::get_var_corr(
  numeric_part,
  comparison_var = "x2",
  get_all = FALSE,
  other_vars = c("x4", "x5")
)
  Comparison_Var Other_Var   p.value Correlation    lower_ci  upper_ci
1             x2        x4 0.5593246  -0.0590762 -0.25256366 0.1389543
2             x2        x5 0.1787881   0.1355323 -0.06255102 0.3233427

您可以保存结果并获取其他统计信息,例如摘要输出。

编辑: 要使用特定变量,可以使用变体get_var_corr_,它支持以下组合(第一列可以舍弃):

# Compute correlations with get_var_corr_() and keep only the rows whose
# Comparison_Var is "x2" or "y" and whose Other_Var is "x4" or "x3".
# NOTE(review): `filter_df` is not defined in this snippet; presumably it is
# the numeric columns of TDT. Confirm before running.
# NOTE(review): `%>%` and `filter()` presumably come from magrittr/dplyr,
# which are not loaded in this snippet. Confirm before running.
get_var_corr_(filter_df) %>% 
   filter(Comparison_Var %in% c("x2","y") & Other_Var %in% c("x4","x3"))
   .id Comparison_Var Other_Var   p.value Correlation    lower_ci  upper_ci
1 Var2              y        x3 0.4898005  0.06985644 -0.12832239 0.2626704
2 Var2             x2        x3 0.6239815 -0.04961512 -0.24365870 0.1482477
3 Var2              y        x4 0.2702013  0.11131634 -0.08700408 0.3011500
4 Var2             x2        x4 0.5593246 -0.05907620 -0.25256366 0.1389543

答案 1 :(得分:1)

另一个选择:

# Names of the numeric columns of TDT; these are the correlation inputs.
numcols <- names(Filter(is.numeric, TDT))
# Columns whose correlation with every numeric column is wanted.
xcor <- c("x2", "x3")
# Within each Time group, cor() yields a matrix with one row per numeric
# column and one column per entry of xcor. t() flips it so that each selected
# variable becomes one output row, labelled by `var`.
TDT[,
  data.table(
    var = xcor,
    t(cor(.SD, .SD[, xcor, with = FALSE]))
  ),
  by = Time,
  .SDcols = numcols
]

输出:

          Time var          Id        norm           y           x2           x3          x4           x5
 1: 2010-01-02  x2 -0.04843595  0.26582680 -0.66584960  1.000000000 -0.061024243 -0.69292534  0.194408505
 2: 2010-01-02  x3  0.39631671 -0.26906428 -0.58788152 -0.061024243  1.000000000 -0.54623949 -0.787149320
 3: 2010-02-02  x2  0.08165416  0.26828706 -0.10444724  1.000000000 -0.120104310  0.08966978 -0.687626977
 4: 2010-02-02  x3 -0.77420649  0.06331042 -0.62424401 -0.120104310  1.000000000 -0.92782037  0.520999829
 5: 2010-03-02  x2  0.53328988 -0.76471756  0.09583857  1.000000000 -0.255684070 -0.57216005  0.583055924
 6: 2010-03-02  x3 -0.03208419 -0.40639968 -0.93857812 -0.255684070  1.000000000  0.21778224 -0.690868245
 7: 2010-04-02  x2 -0.40027209  0.12681443 -0.04596013  1.000000000  0.817854430 -0.31396988  0.188876433
 8: 2010-04-02  x3  0.10876596 -0.23779053 -0.06729634  0.817854430  1.000000000 -0.12087380  0.162171044
 9: 2010-05-02  x2  0.47014025  0.39570025 -0.10324192  1.000000000 -0.353894786  0.62548822  0.861633507
10: 2010-05-02  x3  0.60481815  0.64806521  0.64669485 -0.353894786  1.000000000 -0.76302787 -0.032024563
11: 2010-06-02  x2 -0.29866753 -0.03715892  0.02678853  1.000000000  0.722765758  0.85305747  0.215180135
12: 2010-06-02  x3 -0.71939635  0.18631833  0.20915545  0.722765758  1.000000000  0.90883928  0.338190647
13: 2010-07-02  x2  0.19959128 -0.06868888 -0.38153376  1.000000000 -0.205761177 -0.13574954  0.106505491
14: 2010-07-02  x3 -0.82150037  0.53153382 -0.56734304 -0.205761177  1.000000000  0.23196740  0.254086025
15: 2010-08-02  x2 -0.84788586  0.06205274  0.04195958  1.000000000  0.290333143 -0.17452641 -0.387409233
16: 2010-08-02  x3  0.17476552 -0.39239246  0.07572605  0.290333143  1.000000000  0.12847199 -0.417170776
17: 2010-09-02  x2 -0.28424279 -0.35470966  0.36559401  1.000000000 -0.004094751 -0.60601077  0.429553721
18: 2010-09-02  x3  0.17492786  0.12234080 -0.68318074 -0.004094751  1.000000000 -0.71591012 -0.867131063
19: 2010-10-02  x2 -0.59009254 -0.30596460 -0.20279720  1.000000000 -0.418992781 -0.33474959  0.331402645
20: 2010-10-02  x3 -0.03950447 -0.57650894  0.47707048 -0.418992781  1.000000000  0.61174441 -0.635031910
21: 2010-11-02  x2  0.44280504  0.15356142 -0.41594497  1.000000000  0.130022288 -0.57901168  0.056711167
22: 2010-11-02  x3  0.10088665 -0.19195643  0.47057457  0.130022288  1.000000000  0.64359334  0.947634532
23: 2010-12-02  x2  0.93528272 -0.08754048  0.47678028  1.000000000  0.640307246  0.76212510  0.129150650
24: 2010-12-02  x3  0.33922514 -0.12389088  0.74214451  0.640307246  1.000000000  0.96349801 -0.102219394
25: 2011-01-02  x2 -0.82681063  0.28813098  0.03680233  1.000000000 -0.670768877 -0.81895896  0.665160850
26: 2011-01-02  x3  0.37233376  0.44439836 -0.21498926 -0.670768877  1.000000000  0.96249265 -0.171852151
27: 2011-02-02  x2  0.78406829  0.87265449  0.44109646  1.000000000 -0.170824594 -0.31318417  0.419451424
28: 2011-02-02  x3  0.18246574 -0.53572698  0.34238233 -0.170824594  1.000000000  0.21508906  0.456834070
29: 2011-03-02  x2 -0.56012875  0.61468934  0.37723894  1.000000000 -0.628199774 -0.66904419 -0.051205610
30: 2011-03-02  x3  0.38559989  0.19407633  0.47882281 -0.628199774  1.000000000  0.26507006  0.457066183
31: 2011-04-02  x2  0.06213453  0.27072902 -0.56611872  1.000000000  0.756519929 -0.01807478 -0.240390425
32: 2011-04-02  x3 -0.34954142 -0.40531930 -0.44445227  0.756519929  1.000000000  0.44535133  0.364169201
33: 2011-05-02  x2  0.91869834 -0.52157721  0.09026303  1.000000000 -0.267341912  0.17065244 -0.958919185
34: 2011-05-02  x3 -0.07579900  0.12441121  0.43261423 -0.267341912  1.000000000  0.76093821  0.328896757
35: 2011-06-02  x2  0.14630187 -0.02121530  0.23351755  1.000000000  0.214190295 -0.44671768  0.593123871
36: 2011-06-02  x3  0.92090833 -0.22766946 -0.53092628  0.214190295  1.000000000 -0.01796403 -0.439337427
37: 2011-07-02  x2  0.24085187 -0.01442061  0.46151828  1.000000000 -0.634303063  0.60757379  0.581212136
38: 2011-07-02  x3 -0.65826708 -0.52006935 -0.17321499 -0.634303063  1.000000000 -0.38542234 -0.856055763
39: 2011-08-02  x2  0.13923866 -0.01168973  0.40294949  1.000000000  0.163579759  0.05517129 -0.006048453
40: 2011-08-02  x3  0.11823951  0.80057447 -0.26414984  0.163579759  1.000000000  0.71011089 -0.061702941
          Time var          Id        norm           y           x2           x3          x4           x5