Question

我正在尝试找到一种有效的方法来替换代码中的 for 循环。我找到了一种解决方法，但我确信有一种更“R 友好”的方式来做到这一点。


df<-structure(list(YY = c(2020, 2020, 2021, 2021, 2021), DD = c(6, 
13, 19, 28, 3), MM = c("Fev", "Fev", "Fev", "Fev", "Mar"), Date = structure(c(18664, 
18671, 18677, 18686, 18689), class = "Date"), `ID (FIFA)` = c("FRA D1", 
"FRA D1", "FRA D1", "FRA D1", "FRA D1"), Country = c("France", 
"France", "France", "France", "France"), League = c("Ligue 1", 
"Ligue 1", "Ligue 1", "Ligue 1", "Ligue 1"), Season = c("2020/2021", 
"2020/2021", "2020/2021", "2020/2021", "2020/2021"), HOME = c("Lyon", 
"Lyon", "Brest", "Marseille", "Lyon"), AWAY = c("Strasbourg", 
"Montpellier", "Lyon", "Lyon", "Rennes"), `Final Scores` = c(3, 
1, 2, 1, 1), ...12 = c(0, 2, 3, 1, 0), ...13 = c("H", "A", "A", 
"D", "H"), ...14 = c(NA_character_, NA_character_, NA_character_, 
NA_character_, NA_character_), `ET/Pen/Awd` = c(NA_character_, 
NA_character_, NA_character_, NA_character_, NA_character_), 
    `1st Half Scores` = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), ...17 = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_), ...18 = c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_), ...19 = c(NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_), `2nd Half Scores` = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), ...21 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), ...22 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_
    ), ...23 = c(NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_), FTMoneyline...24 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), ...25 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), ...26 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), `Payout, %...27` = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), FTMoneyline...28 = c(1.45, 
    1.38, 6.5, 5, 1.61), ...29 = c(5.25, 5.5, 4.85, 4.2, 4.4), 
    ...30 = c(8, 8, 1.5, 1.7, 5.9), `Payout, %...31` = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), `FT TG 2.5...32` = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), ...33 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), `FT TG 2.5...34` = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), ...35 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), OUTCOME = c(1, 3, 
    3, 2, 1), REFGAME = c("G236038", "G236098", "G236155", "G236247", 
    "G236276"), m1 = c(4.9, 4.96, 4.28333333333333, 3.63333333333333, 
    3.97), sd = c(3.28899680753874, 3.34287301583533, 2.54771139129481, 
    1.72143351115671, 2.17708520733572), indxcol = c(1L, 1L, 
    3L, 3L, 1L), sp1 = c(-3.8, -4.12, 1.65, 0.8, -2.79), sp2 = c(2.75, 
    2.5, -3.35, -2.5, 1.5), ovr = c(0.513136288998362, 3.14558629776023, 
    2.66983875231297, 2.63305322128851, 1.78822651188164), wavg = c(0.357142857142857, 
    0.369623655913978, 0.377431906614786, 0.385321100917431, 
    0.36943744752309), ospr = c(5.51724137931035, 5.79710144927536, 
    4.33333333333333, 2.94117647058824, 3.66459627329193)), row.names = c(NA, 
-5L), class = c("tbl_df", "tbl", "data.frame"))

我的脚本的目标是计算每个主队在之前的比赛中有多少次是博彩公司的最低小数赔率（赔率在 col df[28:30]），以及它实际对应的次数游戏的结果。这样我就可以得到一个准确率（r/t*100）。

这是我要替换的 for 循环部分的代码：

    if(is.null(nrow(df))|nrow(df)==0){
      r<-0
      return(r)
    }
    r<- 0
    t<- 0
    for(i in nrow(df):1){
      if(grep(team,df[i,])==9 & which.min(df[i,28:30])[[1]]==1){
        t<-t+1
        if(df[i,"OUTCOME"]==1){
          r<-r+1
        }
      } else if(grep(team,df[i,])==10 & which.min(df[i,28:30])[[1]]==3){
        t<-t+1
        if(df[i,"OUTCOME"]==3){
          r<-r+1
        }
      }
    }
    if(r==0 | t==0){
      return(0)
    }
    return(r/t*100)

这是我找到的解决方法，但它看起来并不理想：

    r<- 0
    t<- 0
    df1<- df[grepl(team,df$HOME) & apply(df[28:30],1,which.min)==1,]
    if(is.null(nrow(df1))|nrow(df1)==0){
      r<-0
      return(r)
    }
    r1 <-df1[df1["OUTCOME"]==1,]
    df2<- df[grepl(team,df$AWAY) & apply(df[28:30],1,which.min)==3,]
    if(is.null(nrow(df2))|nrow(df2)==0){
      r<-0
      return(r)
    }
    r2<- df2[df2["OUTCOME"]==3,]
    t<- sum(nrow(df1),nrow(df2))
    r<-sum(nrow(r1),nrow(r2))
    if(r==0 | t==0){
      return(0)
    }
    return(r/t*100)

我怎样才能以“R 友好”的方式更新它？

根据@DaveArmstrong 的评论，下面是最后一个 group_by 之后的输出：

structure(list(obs = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L
), place = c("home", "away", "home", "away", "home", "away", 
"home", "away", "home", "away"), team = c("Lyon", "Strasbourg", 
"Lyon", "Montpellier", "Brest", "Lyon", "Marseille", "Lyon", 
"Lyon", "Rennes"), r = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0)), row.names = c(NA, 
-10L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
    team = c("Brest", "Lyon", "Marseille", "Montpellier", "Rennes", 
    "Strasbourg"), .rows = list(5L, c(1L, 3L, 6L, 8L, 9L), 7L, 
        4L, 10L, 2L)), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS  10.16

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] neuralnet_1.44.2     class_7.3-15         gmodels_2.18.1       viridis_0.5.1        viridisLite_0.3.0   
 [6] plotly_4.9.1         gridExtra_2.3        slackr_1.4.2         qqplotr_0.0.4        ggplot2_3.3.0       
[11] odds.converter_1.4.8 e1071_1.7-2          tibble_2.1.3         readxl_1.3.1         dplyr_0.8.5         
[16] plyr_1.8.4           tidyr_1.1.3         

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.3        cellranger_1.1.0  pillar_1.4.3      compiler_3.6.1    DEoptimR_1.0-8    tools_3.6.1      
 [7] digest_0.6.25     jsonlite_1.6.1    lifecycle_0.2.0   gtable_0.3.0      pkgconfig_2.0.3   rlang_0.4.10     
[13] cli_2.0.2         rstudioapi_0.11   yaml_2.2.0        withr_2.1.2       httr_1.4.0        gtools_3.8.2     
[19] htmlwidgets_1.5.1 vctrs_0.3.6       grid_3.6.1        tidyselect_1.1.0  data.table_1.12.6 glue_1.3.1       
[25] robustbase_0.93-7 R6_2.4.1          fansi_0.4.1       gdata_2.18.0      purrr_0.3.3       magrittr_1.5     
[31] ellipsis_0.3.0    htmltools_0.4.0   scales_1.1.0      MASS_7.3-51.4     assertthat_0.2.1  colorspace_1.4-1 
[37] utf8_1.1.4        lazyeval_0.2.2    munsell_0.5.0     crayon_1.3.4

Answer 1

这是一个选项：

names(df)[c(28:30)] <- c("odds1", "odds2", "odds3")
df2 <- df %>% 
  rowwise %>% 
  mutate(min_2830 = which.min(c(odds1, odds2, odds3))) %>%
  ungroup %>% 
  group_by(HOME) %>% 
  mutate(r_home = as.numeric(min_2830 == 1 & OUTCOME == 1)) %>% 
  ungroup %>% 
  group_by(AWAY) %>% 
  mutate(r_away = as.numeric(min_2830 == 3 & OUTCOME == 3)) %>%  
  select(HOME, AWAY, r_home, r_away) %>% 
  set_names(c("team_home", "team_away", "r_home", "r_away")) %>% 
  ungroup %>% 
  mutate(obs=1:n()) %>% 
  pivot_longer(-obs, names_pattern="(.*)_(.*)", 
               names_to = c(".value", "place")) %>% 
  group_by(team) %>% 
  summarise(r = cumsum(r)) %>% 
  mutate(game = seq_along(r), 
         pct = (r/game)*100)
  

df2
# # A tibble: 10 x 4
# # Groups:   team [6]
#   team            r  game   pct
#   <chr>       <dbl> <int> <dbl>
# 1 Brest           0     1   0  
# 2 Lyon            1     1 100  
# 3 Lyon            1     2  50  
# 4 Lyon            2     3  66.7
# 5 Lyon            2     4  50  
# 6 Lyon            3     5  60  
# 7 Marseille       0     1   0  
# 8 Montpellier     0     1   0  
# 9 Rennes          0     1   0  
# 10 Strasbourg      0     1   0

sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] modelr_0.1.8    forcats_0.5.0   stringr_1.4.0   dplyr_1.0.3     purrr_0.3.4     readr_1.4.0     tidyr_1.1.2    
 [8] tibble_3.0.5    ggplot2_3.3.2   tidyverse_1.3.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6       cellranger_1.1.0 pillar_1.4.7     compiler_4.0.3   dbplyr_1.4.3     tools_4.0.3     
 [7] jsonlite_1.7.1   lubridate_1.7.9  lifecycle_0.2.0  nlme_3.1-149     gtable_0.3.0     lattice_0.20-41 
[13] pkgconfig_2.0.3  rlang_0.4.10     reprex_0.3.0     cli_2.2.0        DBI_1.1.0        rstudioapi_0.11 
[19] haven_2.3.1      withr_2.3.0      xml2_1.3.2       httr_1.4.2       fs_1.4.1         generics_0.1.0  
[25] vctrs_0.3.6      hms_0.5.3        grid_4.0.3       tidyselect_1.1.0 glue_1.4.2       R6_2.5.0        
[31] fansi_0.4.2      readxl_1.3.1     magrittr_2.0.1   backports_1.1.10 scales_1.1.1     ellipsis_0.3.1  
[37] rvest_0.3.6      assertthat_0.2.1 colorspace_1.4-1 utf8_1.1.4       stringi_1.5.3    munsell_0.5.0   
[43] broom_0.5.6      crayon_1.3.4

替换 for 循环以提高效率

1 个答案: