Question

我是R的新手。我有一个大型数据框，其中包含数百万行，如下所示：

Whole   code1       P_1   Q_1   code2   P_2   Q_2   code3   P_3   Q_3
    64      a       0.2   0.1   b      0.3    0.2   d      0.1    0.9
    55      a       0.5   0.3   c      0.1    0.3   b      0.4    0.4
    70      b       0.4   0.1   d      0.2    0.5   NULL   0.7    0.7
    26      c       0.7   0.5   a      0.2    0.6   b      0.2    0.2
    47      a       0.8   0.7   d      0.1    0.2   NULL   0.6    0.8
    35      d       0.2   0.8   b      0.8    0.1   a      0.2    0.1

根据代码1，代码2和代码3中的值，我正在寻找三个输出字段。

> Output1   :   If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2   :  If code1 is 'a' or 'b', then Output1 = Whole*P_2, else Output2 = Whole* Q_2
> Output3   :  If code1 is 'a' or 'b', then Output1 = Whole*P_3, else Output3 = Whole* Q_3

如果能更正以下代码，将不胜感激：

 df1 %>%    
  for (i in 1:6) {
    if (paste0("code", i) %in% c("a", "b")) {
      mutate (paste0("Output", i) = Whole * paste0("P_", i) )
    } else {    
      mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
    }   
  }

Answer 1

library(dplyr)
df1 %>%
  mutate(
    Output1 = Whole * if_else(code1 %in% c('a', 'b'), P_1, Q_1),
    Output2 = Whole * if_else(code1 %in% c('a', 'b'), P_2, Q_2),
    Output3 = Whole * if_else(code1 %in% c('a', 'b'), P_3, Q_3)
  )
#   Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1    64     a 0.2 0.1     b 0.3 0.2     d 0.1 0.9    12.8    19.2     6.4
# 2    55     a 0.5 0.3     c 0.1 0.3     b 0.4 0.4    27.5     5.5    22.0
# 3    70     b 0.4 0.1     d 0.2 0.5  NULL 0.7 0.7    28.0    14.0    49.0
# 4    26     c 0.7 0.5     a 0.2 0.6     b 0.2 0.2    13.0    15.6     5.2
# 5    47     a 0.8 0.7     d 0.1 0.2  NULL 0.6 0.8    37.6     4.7    28.2
# 6    35     d 0.2 0.8     b 0.8 0.1     a 0.2 0.1    28.0     3.5     3.5

如果您的数据更通用（不是硬编码或多于“ 3”组列），那么我们可以调整数据的形状，进行赋值然后将其重新成形。

library(tidyr)
df1 %>%
  rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
  pivot_longer(
    -Whole,
    names_to = c(".value", "set"),
    names_sep = "_"
  ) %>%
  mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
  pivot_wider(
    id_cols = Whole,
    names_from = set,
    values_from = c(code, P, Q, Output),
    names_sep = "_"
  )
# # A tibble: 6 x 13
#   Whole code_1 code_2 code_3   P_1   P_2   P_3   Q_1   Q_2   Q_3 Output_1 Output_2 Output_3
#   <int> <chr>  <chr>  <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>    <dbl>
# 1    64 a      b      d        0.2   0.3   0.1   0.1   0.2   0.9     12.8     19.2     57.6
# 2    55 a      c      b        0.5   0.1   0.4   0.3   0.3   0.4     27.5     16.5     22  
# 3    70 b      d      NULL     0.4   0.2   0.7   0.1   0.5   0.7     28       35       49  
# 4    26 c      a      b        0.7   0.2   0.2   0.5   0.6   0.2     13        5.2      5.2
# 5    47 a      d      NULL     0.8   0.1   0.6   0.7   0.2   0.8     37.6      9.4     37.6
# 6    35 d      b      a        0.2   0.8   0.2   0.8   0.1   0.1     28       28        7

作为旁注，通常我建议将其保留为“长”格式，而不要重新扩展。对于其他类似整洁的函数（包括ggplot2），通常首选这种“长”格式，并且很容易扩展为任意计数。这将导致这样的数据：

df1 %>%
  rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
  pivot_longer(
    -Whole,
    names_to = c(".value", "set"),
    names_sep = "_"
  ) %>%
  mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q))
# # A tibble: 18 x 6
#    Whole set   code      P     Q Output
#    <int> <chr> <chr> <dbl> <dbl>  <dbl>
#  1    64 1     a       0.2   0.1   12.8
#  2    64 2     b       0.3   0.2   19.2
#  3    64 3     d       0.1   0.9   57.6
#  4    55 1     a       0.5   0.3   27.5
#  5    55 2     c       0.1   0.3   16.5
#  6    55 3     b       0.4   0.4   22  
#  7    70 1     b       0.4   0.1   28  
#  8    70 2     d       0.2   0.5   35  
#  9    70 3     NULL    0.7   0.7   49  
# 10    26 1     c       0.7   0.5   13  
# 11    26 2     a       0.2   0.6    5.2
# 12    26 3     b       0.2   0.2    5.2
# 13    47 1     a       0.8   0.7   37.6
# 14    47 2     d       0.1   0.2    9.4
# 15    47 3     NULL    0.6   0.8   37.6
# 16    35 1     d       0.2   0.8   28  
# 17    35 2     b       0.8   0.1   28  
# 18    35 3     a       0.2   0.1    7

（短得多。）

Answer 2

我们可以使用map2。获取具有“ P”，“ Q”的列的名称。 _后跟数字。然后使用map2遍历相应的列，应用转换逻辑并将这些列与原始数据集绑定

library(dplyr)
library(purrr)
library(stringr)
ps <-  names(df1)[str_detect(names(df1), "^P_\\d+$")]
qs <-  names(df1)[str_detect(names(df1), "^Q_\\d+$")]
map2_dfc(ps, qs, ~ df1 %>%
                   transmute(Output = Whole *  
       case_when(code1 %in% c('a', 'b') ~ !! rlang::sym(.x),
            TRUE ~ !! rlang::sym(.y)))) %>% 
    rename_all(~ str_remove(., fixed("..."))) %>%     
    bind_cols(df1, .)
#   Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1    64     a 0.2 0.1     b 0.3 0.2     d 0.1 0.9    12.8    19.2     6.4
#2    55     a 0.5 0.3     c 0.1 0.3     b 0.4 0.4    27.5     5.5    22.0
#3    70     b 0.4 0.1     d 0.2 0.5  NULL 0.7 0.7    28.0    14.0    49.0
#4    26     c 0.7 0.5     a 0.2 0.6     b 0.2 0.2    13.0    15.6     5.2
#5    47     a 0.8 0.7     d 0.1 0.2  NULL 0.6 0.8    37.6     4.7    28.2
#6    35     d 0.2 0.8     b 0.8 0.1     a 0.2 0.1    28.0     3.5     3.5

数据

df1 <- structure(list(Whole = c(64L, 55L, 70L, 26L, 47L, 35L), code1 = c("a", 
"a", "b", "c", "a", "d"), P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2
), Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8), code2 = c("b", "c", 
"d", "a", "d", "b"), P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8), Q_2 = c(0.2, 
0.3, 0.5, 0.6, 0.2, 0.1), code3 = c("d", "b", "NULL", "b", "NULL", 
"a"), P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2), Q_3 = c(0.9, 0.4, 
0.7, 0.2, 0.8, 0.1)), class = "data.frame", row.names = c(NA, 
-6L))

Answer 3

根据您的行数，此data.table方法可能会更快。

library(data.table)
setDT(df1)[,Logical := (code1 == "a" | code1 == "b")][
  ,`:=`(Output1 = numeric(),Output2 = numeric(), Output3 = numeric())
  ][Logical == TRUE,`:=`(Output1 = Whole * P_1,
                         Output2 = Whole * P_2,
                         Output3 = Whole * P_3)
    ][Logical == FALSE,`:=`(Output1 = Whole * Q_1,
                            Output2 = Whole * Q_2,
                            Output3 = Whole * Q_3)
      ][,.(Output1,Output2,Output3)]

   Output1 Output2 Output3
1:    12.8    19.2     6.4
2:    27.5     5.5    22.0
3:    28.0    14.0    49.0
4:    13.0    15.6     5.2
5:    37.6     4.7    28.2
6:    28.0     3.5     3.5

R数据框：遍历多个列和行值

3 个答案:

数据