将3列连接为线性回归模型

时间:2018-08-20 13:03:44

标签: r

这是我的data.frame:

df.index= dput(df.index)
    structure(list(Var1 = structure(c(43L, 42L, 46L, 33L, 29L), .Label = c("ABEV3", 
    "AEDU3", "ALLL3", "BBAS3", "BBDC3", "BBDC4", "BISA3", "BRAP4", 
    "BRFS3", "BRKM5", "BRML3", "BRPR3", "BVMF3", "CCRO3", "CESP6", 
    "CIEL3", "CMIG4", "CPFE3", "CPLE6", "CRUZ3", "CSAN3", "CSNA3", 
    "CTIP3", "CYRE3", "DASA3", "DTEX3", "ECOR3", "ELET3", "ELET6", 
    "ELPL4", "EMBR3", "ENBR3", "ESTC3", "EVEN3", "FIBR3", "GFSA3", 
    "GGBR4", "GOAU4", "GOLL4", "HGTX3", "HYPE3", "ITSA4", "ITUB4", 
    "JBSS3", "KLBN4", "KROT3", "LAME4", "LIGT3", "LREN3", "MRFG3", 
    "MRVE3", "NATU3", "OIBR4", "PCAR4", "PDGR3", "PETR3", "PETR4", 
    "QUAL3", "RENT3", "RSID3", "SANB11", "SBSP3", "SUZB5", "TBLE3", 
    "TIMP3", "UGPA3", "USIM5", "VALE3", "VALE5", "VIVT4"), class = "factor"), 
        Var2 = structure(c(42L, 43L, 33L, 46L, 28L), .Label = c("ABEV3", 
        "AEDU3", "ALLL3", "BBAS3", "BBDC3", "BBDC4", "BISA3", "BRAP4", 
        "BRFS3", "BRKM5", "BRML3", "BRPR3", "BVMF3", "CCRO3", "CESP6", 
        "CIEL3", "CMIG4", "CPFE3", "CPLE6", "CRUZ3", "CSAN3", "CSNA3", 
        "CTIP3", "CYRE3", "DASA3", "DTEX3", "ECOR3", "ELET3", "ELET6", 
        "ELPL4", "EMBR3", "ENBR3", "ESTC3", "EVEN3", "FIBR3", "GFSA3", 
        "GGBR4", "GOAU4", "GOLL4", "HGTX3", "HYPE3", "ITSA4", "ITUB4", 
        "JBSS3", "KLBN4", "KROT3", "LAME4", "LIGT3", "LREN3", "MRFG3", 
        "MRVE3", "NATU3", "OIBR4", "PCAR4", "PDGR3", "PETR3", "PETR4", 
        "QUAL3", "RENT3", "RSID3", "SANB11", "SBSP3", "SUZB5", "TBLE3", 
        "TIMP3", "UGPA3", "USIM5", "VALE3", "VALE5", "VIVT4"), class = "factor"), 
        time = structure(c(1L, 1L, 1L, 1L, 1L), class = "factor", .Label = "t")), class = "data.frame", row.names = c(NA, 
    -5L))

它是这样的:

   Var1  Var2 time
1 ITUB4 ITSA4    t
2 ITSA4 ITUB4    t
3 KROT3 ESTC3    t
4 ESTC3 KROT3    t
5 ELET6 ELET3    t

我想将这3列连接成这样的文本:

"ITUB4~ITSA4+t" "ITSA4~ITUB4+t" "KROT3~ESTC3+t" "ESTC3~KROT3+t" "ELET6~ELET3+t"

我正在使用apply函数:

df.index=apply(df.index,1,paste,collapse="~+")

但是结果不对。问题在于我无法用“+”符号把第二列和第三列隔开。怎样才能让第一列与第二列之间用“~”连接,而第二列与“t”变量之间用“+”连接?

我想要的结果是:

"ITUB4~ITSA4+t" "ITSA4~ITUB4+t" "KROT3~ESTC3+t" "ESTC3~KROT3+t" "ELET6~ELET3+t"

如上所述。

2 个答案:

答案 0 :(得分:3)

我们可以使用paste0:

with(df.index, paste0(Var1, "~", Var2, "+", time))
#[1] "ITUB4~ITSA4+t" "ITSA4~ITUB4+t" "KROT3~ESTC3+t" "ESTC3~KROT3+t" "ELET6~ELET3+t"

由于OP提到想用apply来得到结果,也可以将MARGIN指定为1(按行),然后对数据集的每一行应用paste0。不过paste本身就是向量化的,因此这种逐行调用apply的做法效率会更低:

apply(df.index, 1, FUN = function(x) paste0(x[1], "~", x[2], "+", x[3]))

答案 1 :(得分:2)

如果您希望每个元素都是公式(formula类)对象,可以执行以下操作。请注意,我首先使用mutate_if将所有因子(factor)列转换为字符型。

library(tidyverse)

df <- df %>% mutate_if(is.factor, as.character) %>%
  mutate(forms = map2(Var1, Var2, ~reformulate(c(.y, "t"), .x, TRUE)))
df
#>    Var1  Var2 time             forms
#> 1 ITUB4 ITSA4    t ITUB4 ~ ITSA4 + t
#> 2 ITSA4 ITUB4    t ITSA4 ~ ITUB4 + t
#> 3 KROT3 ESTC3    t KROT3 ~ ESTC3 + t
#> 4 ESTC3 KROT3    t ESTC3 ~ KROT3 + t
#> 5 ELET6 ELET3    t ELET6 ~ ELET3 + t

df$forms
#> [[1]]
#> ITUB4 ~ ITSA4 + t
#> <environment: 0x7fe3b5854c88>
#> 
#> [[2]]
#> ITSA4 ~ ITUB4 + t
#> <environment: 0x7fe3b583d1a8>
#> 
#> [[3]]
#> KROT3 ~ ESTC3 + t
#> <environment: 0x7fe3b58352f8>
#> 
#> [[4]]
#> ESTC3 ~ KROT3 + t
#> <environment: 0x7fe3b58333a8>
#> 
#> [[5]]
#> ELET6 ~ ELET3 + t
#> <environment: 0x7fe3b581c8a8>

由reprex包(v0.2.0)于2018-08-20创建。