dplyr:group_by()并运行回归

时间:2018-10-01 15:28:36

标签: r dplyr

我希望从多个模型中进行回归(提取beta)。

在这里,我有3家公司(BABA,GOOG,AMZN)和1个基准(SPY)。我正在尝试group_by(symbol),并将每家公司分别对基准进行回归分析。

因此BABA将在SPY上运行

GOOG将是第二次在SPY上运行

最后AMZN将作为第三次回归,同样在SPY上运行

所以每个公司都使用固定的SPY

我一直在尝试以下代码,但没有成功:

# Attempted per-symbol regression. After group_by(symbol) each group contains
# only that symbol's rows, and the only columns in `df` are symbol, date, close
# and returns -- there is no column named SPY for the formula to use as the
# response, so SPY's returns are never available inside a group.
df %>% 
  group_by(symbol) %>%
  do(mod = lm(SPY ~ ., data = .))

数据:

df <- structure(list(symbol = c("BABA", "BABA", "BABA", "BABA", "BABA", 
"BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", 
"BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", 
"BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", 
"BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "BABA", "GOOG", 
"GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", 
"GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", 
"GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", 
"GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", "GOOG", 
"GOOG", "GOOG", "GOOG", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", 
"AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", 
"AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", 
"AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", 
"AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "AMZN", "SPY", 
"SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", 
"SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", 
"SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", 
"SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY", "SPY"), date = structure(c(16708, 
16738, 16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982, 
17011, 17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256, 
17284, 17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529, 
17562, 17590, 17619, 17651, 17682, 17711, 17743, 17774, 16708, 
16738, 16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982, 
17011, 17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256, 
17284, 17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529, 
17562, 17590, 17619, 17651, 17682, 17711, 17743, 17774, 16708, 
16738, 16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982, 
17011, 17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256, 
17284, 17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529, 
17562, 17590, 17619, 17651, 17682, 17711, 17743, 17774, 16708, 
16738, 16769, 16800, 16829, 16860, 16891, 16920, 16952, 16982, 
17011, 17044, 17074, 17105, 17135, 17165, 17197, 17225, 17256, 
17284, 17317, 17347, 17378, 17409, 17438, 17470, 17500, 17529, 
17562, 17590, 17619, 17651, 17682, 17711, 17743, 17774), class = "Date"), 
    close = c(58.970001, 83.830002, 84.080002, 81.269997, 67.029999, 
    68.809998, 79.029999, 76.940002, 82, 79.529999, 82.480003, 
    97.190002, 105.790001, 101.690002, 94.019997, 87.809998, 
    101.309998, 102.900002, 107.830002, 115.5, 122.459999, 140.899994, 
    154.949997, 171.740005, 172.710007, 184.889999, 177.080002, 
    172.429993, 204.289993, 186.139999, 183.539993, 178.539993, 
    198.009995, 185.529999, 187.229996, 175.009995, 608.419983, 
    710.809998, 742.599976, 758.880005, 742.950012, 697.77002, 
    744.950012, 693.01001, 735.719971, 692.099976, 768.789978, 
    767.049988, 777.289978, 784.539978, 758.039978, 771.820007, 
    796.789978, 823.210022, 829.559998, 905.960022, 964.859985, 
    908.72998, 930.5, 939.330017, 959.109985, 1016.640015, 1021.409973, 
    1046.400024, 1169.939941, 1104.72998, 1031.790039, 1017.330017, 
    1084.98999, 1115.650024, 1217.26001, 1218.189941, 511.890015, 
    625.900024, 664.799988, 675.890015, 587, 552.52002, 593.640015, 
    659.590027, 722.789978, 715.619995, 758.809998, 769.159973, 
    837.309998, 789.820007, 750.570007, 749.869995, 823.47998, 
    845.039978, 886.539978, 924.98999, 994.619995, 968, 987.780029, 
    980.599976, 961.349976, 1105.280029, 1176.75, 1169.469971, 
    1450.890015, 1512.449951, 1447.339966, 1566.130005, 1629.619995, 
    1699.800049, 1777.439941, 2012.709961, 191.589996, 207.929993, 
    208.690002, 203.869995, 193.720001, 193.559998, 205.520004, 
    206.330002, 209.839996, 209.479996, 217.119995, 217.380005, 
    216.300003, 212.550003, 220.380005, 223.529999, 227.529999, 
    236.470001, 235.740005, 238.080002, 241.440002, 241.800003, 
    246.770004, 247.490005, 251.229996, 257.149994, 265.01001, 
    266.859985, 281.899994, 271.649994, 263.149994, 264.51001, 
    270.940002, 271.279999, 281.329987, 290.309998), returns = c(NA, 
    0.421570299786836, 0.00298222586228736, -0.0334206105275782, 
    -0.17521838963523, 0.0265552592354952, 0.14852494255268, 
    -0.0264456159236444, 0.0657655038792433, -0.0301219634146341, 
    0.0370929716722364, 0.178346247150355, 0.0884864576913991, 
    -0.0387560162703845, -0.0754253599090302, -0.0660497681147555, 
    0.153741035274821, 0.015694443109159, 0.047910591877345, 
    0.0711304633009282, 0.0602597316017315, 0.150579741552995, 
    0.0997161362547681, 0.108357588416087, 0.00564808414906004, 
    0.0705227925791236, -0.0422413166869019, -0.0262593683503574, 
    0.184770639061616, -0.0888442636541674, -0.0139680133983453, 
    -0.0272420191276787, 0.109051208487501, -0.0630271012329453, 
    0.00916292248780737, -0.0652673250070464, NA, 0.168288382796263, 
    0.0447235943352615, 0.0219230130974311, -0.020991451738144, 
    -0.0608116175654627, 0.0676153899532685, -0.0697228017495488, 
    0.06162964514755, -0.0592888554332855, 0.110807693482712, 
    -0.00226328392641983, 0.0133498339876124, 0.00932727837126435, 
    -0.0337777560648413, 0.0181784990237019, 0.0323520649549578, 
    0.0331581027993302, 0.00771367674141366, 0.0920970444382494, 
    0.0650138654793755, -0.0581742489818355, 0.0239565332707523, 
    0.00948954003224078, 0.0210575278571132, 0.0599827245047395, 
    0.00469188496382378, 0.0244662296830735, 0.118061844578092, 
    -0.0557378705647592, -0.0660251304124109, -0.0140145004830774, 
    0.0665073986507567, 0.0282583565586627, 0.0910769361485713, 
    0.000763954284508372, NA, 0.222723643085712, 0.0621504433749629, 
    0.0166817496994298, -0.131515502562943, -0.0587393185689948, 
    0.0744226335907248, 0.111094283292207, 0.0958170202897868, 
    -0.00991987052703713, 0.0603532647239684, 0.0136397451631891, 
    0.0886031871031852, -0.0567173342172369, -0.0496948667444936, 
    -0.000932640517835148, 0.0981636623558995, 0.0261815691014129, 
    0.0491101025755256, 0.0433708721029611, 0.0752764956948344, 
    -0.0267639853751381, 0.0204339142561984, -0.00726887848428048, 
    -0.0196308387427495, 0.149716603311175, 0.0646623200680305, 
    -0.00618655534310597, 0.240638965495934, 0.0424290851570854, 
    -0.0430493484805567, 0.0820747314318273, 0.0405394123076008, 
    0.043065287745196, 0.0456758970242859, 0.13236453990543, 
    NA, 0.0852862745505771, 0.00365511963442433, -0.0230964921836553, 
    -0.0497866005245156, -0.000825949820225325, 0.0617896575923709, 
    0.00394121245735279, 0.0170115541413121, -0.00171559286533729, 
    0.0364712580956894, 0.00119754055816013, -0.0049682674356365, 
    -0.0173370316596806, 0.0368383998564328, 0.0142934655074538, 
    0.0178946898308714, 0.0392915309598361, -0.00308705542738164, 
    0.00992617693377928, 0.0141129031072504, 0.00149105780739678, 
    0.0205541808864245, 0.00291770064565866, 0.0151116850153201, 
    0.0235640572155245, 0.0305658805498554, 0.00698077404698783, 
    0.0563591765172287, -0.0363604122673377, -0.0312902638974474, 
    0.00516821596431427, 0.0243090686813703, 0.00125487929980883, 
    0.037046549826919, 0.0319198500513918)), .Names = c("symbol", 
"date", "close", "returns"), row.names = c(NA, -144L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), vars = "symbol", indices = list(
    72:107, 0:35, 36:71, 108:143), group_sizes = c(36L, 36L, 
36L, 36L), biggest_group_size = 36L, labels = structure(list(
    symbol = c("AMZN", "BABA", "GOOG", "SPY")), row.names = c(NA, 
-4L), class = "data.frame", vars = "symbol", .Names = "symbol"))

编辑:

我得到了不同的结果:

这是基本模型:

# Single-stock baseline: slope of GOOG's monthly returns regressed on SPY's.
library(tidyquant)

symbols <- c("GOOG")
bench <- "SPY"

start_date_beta <- "2015-09-01"
end_date_beta <- "2018-09-01"

# Download prices for `ticker` over the configured window, convert them to
# monthly periods with to.period, and keep only the date and close columns.
monthly_close <- function(ticker) {
  daily <- tq_get(ticker,
                  get = "stock.prices",
                  from = start_date_beta,
                  to = end_date_beta)
  monthly <- tq_transmute(daily,
                          select = open:volume,
                          mutate_fun = to.period,
                          period = "months")
  select(monthly, date, close)
}

stock_returns <- monthly_close(symbols)
bench_returns <- monthly_close(bench)

# One row per date with both closing prices side by side.
returns <- full_join(stock_returns, bench_returns, by = "date")
names(returns) <- c("date", "GOOG", "SPY")

# Period-over-period percentage change of each close series.
returns$GOOGret <- Delt(returns$GOOG)
returns$SPYret <- Delt(returns$SPY)

# Second coefficient = slope on SPYret (stock is the response here).
coef(lm(returns$GOOGret ~ returns$SPYret))[[2]]

> lm(returns$GOOGret ~ returns$SPYret)$coeff[[2]]
[1] 1.412548

这是更高级的模型

# Multi-stock set-up: monthly closes and returns for BABA, GOOG, AMZN and the
# SPY benchmark, stacked in one long table keyed by symbol.
library(tidyquant)

symbols_beta <- c("BABA", "GOOG", "AMZN")
bench <- "SPY"
start_date_beta <- "2015-09-01"
end_date_beta <- "2018-09-01"

# Convert prices to monthly periods separately for each symbol and keep the
# symbol, date and close columns.
to_monthly_close <- function(prices) {
  by_symbol <- group_by(prices, symbol)
  monthly <- tq_transmute(by_symbol,
                          select = open:volume,
                          mutate_fun = to.period,
                          period = "months")
  select(monthly, symbol, date, close)
}

stock_prices <- tq_get(symbols_beta,
                       get = "stock.prices",
                       from = start_date_beta,
                       to = end_date_beta)
stock_periods <- to_monthly_close(stock_prices)

bench_prices <- tq_get(bench,
                       get = "stock.prices",
                       from = start_date_beta,
                       to = end_date_beta)
# Make sure the benchmark rows carry a symbol label before grouping on it.
bench_prices <- mutate(bench_prices, symbol = "SPY")
bench_periods <- to_monthly_close(bench_prices)

# Stack stocks and benchmark, then compute returns within each symbol so the
# first observation of every symbol is NA rather than a cross-symbol change.
returns <- rbind(stock_periods, bench_periods) %>%
  group_by(symbol) %>%
  mutate(returns = Delt(close))


library(tidyverse)

# Drop the grouping left by the preceding group_by(symbol) before filtering.
returns = ungroup(returns)

# For every symbol other than SPY: join SPY's returns with that symbol's
# returns on date, fit a linear model and keep the second coefficient (slope).
# set_names() names each element after itself, so inside the lambda .x is the
# symbol and .y is its (identical) name, and the result is named by symbol.
# NOTE(review): the formula is built as spy_returns ~ <symbol>, i.e. SPY is the
# response and the stock is the predictor. The single-stock model shown before
# this one fits GOOGret ~ SPYret (stock as response), so the two slopes are not
# the same quantity -- this looks like the reason the results differ.
imap_dbl(unique(returns$symbol)[unique(returns$symbol) != "SPY"] %>% set_names(),

         ~ left_join(returns %>% filter(symbol=="SPY") %>% 
                       select(date, spy_returns=returns),
                     returns %>% filter(symbol==.x) %>% 
                       select(date, !!.y:=returns),
                     by="date") %>% 
           lm(paste("spy_returns ~ ", .y), data=.) %>% 
           coef() %>% .[2]
)

     BABA      GOOG      AMZN 
0.1533903 0.3290332 0.2306936 

1 个答案:

答案 0 :(得分:0)

您需要将SPY的收益率和另一只股票的收益率放在不同的列中。您可以使用purrr包中的imap遍历非SPY股票代码。例如,在下面的代码中,对每个非SPY代码进行迭代,我们使用left_join(按date连接)创建包含SPY收益率和该股票收益率的数据框。然后我们进行回归,并将每个回归的第二个系数作为命名向量返回:

library(tidyverse)

# Remove the grouping stored on `df` so filter() sees the whole table.
df <- ungroup(df)

# Benchmark returns, shared by every regression.
spy <- df %>%
  filter(symbol == "SPY") %>%
  select(date, spy_returns = returns)

# Non-benchmark symbols. set_names() names each element after itself so that
# imap_dbl() returns a vector named by symbol.
stock_symbols <- setdiff(unique(df$symbol), "SPY") %>% set_names()

# Beta is the slope of the stock's returns regressed on the benchmark's
# returns, so the stock is the response and spy_returns the predictor.
imap_dbl(stock_symbols, function(sym, nm) {
  stock <- df %>%
    filter(symbol == sym) %>%
    select(date, !!nm := returns)

  fit <- lm(
    # as.name() keeps tickers that are not syntactic R names in one piece
    # instead of having them parsed as an expression.
    reformulate("spy_returns", response = as.name(nm)),
    data = left_join(spy, stock, by = "date")
  )

  # Second coefficient of the fit: the slope on spy_returns.
  coef(fit)[["spy_returns"]]
})
# Result: a named numeric vector with one beta per symbol (BABA, GOOG, AMZN).
# The slopes originally listed here (BABA 0.1533903, GOOG 0.3290332,
# AMZN 0.2306936) came from the reversed model spy_returns ~ <symbol> and are
# not betas.