在if / else if / else语句中使用mutate创建多个新变量

时间:2020-07-09 02:44:38

标签: r dplyr conditional-statements tidyr mutate

我想基于if / else if / else语句创建多个新变量(具体写法取决于数据是宽格式还是长格式)。我熟悉mutate的基本用法,但似乎无法让它在这种条件分支下同时创建多个变量。我看到有很多类似的帖子,并尝试了其中提供的解决方案,但都没有成功。

以下是宽格式数据的示例:

# Example input in wide format, written as a structure() literal: a data.frame
# with 20 rows, a factor column `distribution` (10 levels; every row here is
# coded 1L, i.e. "bimodal") and five integer columns Rating_1 ... Rating_5.
# NOTE(review): the literal is not assigned to a name here; the later snippets
# refer to the data as `df`.
structure(list(distribution = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("bimodal", 
"extreme_left", "extreme_right", "left_skew", "right_skew", "trunc_norm_left", 
"trunc_norm_right", "trunc_uni_left", "trunc_uni_right", "uniform"
), class = "factor"), Rating_1 = c(25L, 15L, 15L, 5L, 5L, 4L, 
4L, 18L, 5L, 9L, 25L, 7L, 2L, 1L, 5L, 12L, 5L, 13L, 7L, 11L), 
    Rating_2 = c(0L, 8L, 6L, 4L, 5L, 11L, 8L, 7L, 12L, 6L, 0L, 
    11L, 10L, 1L, 11L, 10L, 10L, 8L, 10L, 5L), Rating_3 = c(0L, 
    4L, 9L, 9L, 13L, 20L, 10L, 0L, 16L, 8L, 0L, 14L, 26L, 5L, 
    16L, 6L, 20L, 8L, 13L, 9L), Rating_4 = c(0L, 8L, 6L, 19L, 
    19L, 11L, 13L, 7L, 12L, 12L, 0L, 11L, 10L, 9L, 13L, 10L, 
    10L, 8L, 12L, 9L), Rating_5 = c(25L, 15L, 14L, 13L, 8L, 4L, 
    15L, 18L, 5L, 15L, 25L, 7L, 2L, 34L, 5L, 12L, 5L, 13L, 8L, 
    16L)), row.names = c(NA, 20L), class = "data.frame")

这是我尝试的策略之一:

# Attempted approach, kept as the non-working example the question is about.
# Why it fails even though it now parses:
#   * `if` / `else if` takes ONE scalar TRUE/FALSE condition, so it cannot pick
#     a different branch for each row of the data.
#   * `distribution` is referenced outside of any dplyr verb, where the columns
#     of `df` are not in scope.
#   * the mutate() calls inside the branches are not given a data frame.
# Each branch is meant to compute Dist_k = Rating_k * 20 - offset, with the
# offsets depending on the value of `distribution`.
df %>%
  arrange(distribution) %>%
  if (distribution == "bimodal") {
    # Dist5 subtracts 5 so that it agrees with the expected output shown in the
    # question (e.g. 25 * 20 - 5 = 495).
    mutate(Dist1 = Rating_1 * 20 - 5,
           Dist2 = Rating_2 * 20 - 0,
           Dist3 = Rating_3 * 20 - 0,
           Dist4 = Rating_4 * 20 - 0,
           Dist5 = Rating_5 * 20 - 5)
  } else if (distribution == "extreme_left") {
    mutate(Dist1 = Rating_1 * 20 - 0,
           Dist2 = Rating_2 * 20 - 0,
           Dist3 = Rating_3 * 20 - 1,
           Dist4 = Rating_4 * 20 - 2,
           Dist5 = Rating_5 * 20 - 6)
  } else if (distribution == "extreme_right") {
    mutate(Dist1 = Rating_1 * 20 - 6,
           Dist2 = Rating_2 * 20 - 3,
           Dist3 = Rating_3 * 20 - 1,
           Dist4 = Rating_4 * 20 - 0,
           Dist5 = Rating_5 * 20 - 0)
  } else if (distribution == "left_skew") {
    mutate(Dist1 = Rating_1 * 20 - 1,
           Dist2 = Rating_2 * 20 - 1,
           Dist3 = Rating_3 * 20 - 2,
           Dist4 = Rating_4 * 20 - 2,
           Dist5 = Rating_5 * 20 - 4)
  } else if (distribution == "right_skew") {
    mutate(Dist1 = Rating_1 * 20 - 4,
           Dist2 = Rating_2 * 20 - 2,
           Dist3 = Rating_3 * 20 - 2,
           Dist4 = Rating_4 * 20 - 1,
           Dist5 = Rating_5 * 20 - 1)
  } else if (distribution == "trunc_norm_left") {
    mutate(Dist1 = Rating_1 * 20 - 0,
           Dist2 = Rating_2 * 20 - 0,
           Dist3 = Rating_3 * 20 - 6,
           Dist4 = Rating_4 * 20 - 3,
           Dist5 = Rating_5 * 20 - 1)
  } else if (distribution == "trunc_norm_right") {
    mutate(Dist1 = Rating_1 * 20 - 1,
           Dist2 = Rating_2 * 20 - 3,
           Dist3 = Rating_3 * 20 - 6,
           Dist4 = Rating_4 * 20 - 0,
           Dist5 = Rating_5 * 20 - 0)
  } else if (distribution == "trunc_uni_left") {
    mutate(Dist1 = Rating_1 * 20 - 0,
           Dist2 = Rating_2 * 20 - 0,
           Dist3 = Rating_3 * 20 - 4,
           Dist4 = Rating_4 * 20 - 3,
           Dist5 = Rating_5 * 20 - 3)
  } else if (distribution == "trunc_uni_right") {
    mutate(Dist1 = Rating_1 * 20 - 3,
           Dist2 = Rating_2 * 20 - 3,
           Dist3 = Rating_3 * 20 - 4,
           Dist4 = Rating_4 * 20 - 0,
           Dist5 = Rating_5 * 20 - 0)
  } else {
    # Fallback for every remaining distribution (in the sample factor levels
    # that is "uniform").
    mutate(Dist1 = Rating_1 * 20 - 2,
           Dist2 = Rating_2 * 20 - 2,
           Dist3 = Rating_3 * 20 - 2,
           Dist4 = Rating_4 * 20 - 2,
           Dist5 = Rating_5 * 20 - 2)
  }

我尝试过使用长格式和宽格式数据,尽管我意识到长格式数据还缺少一些东西。

对于宽格式数据,我期望得到类似下面的结果(在原有各列之后新增 Dist1 到 Dist5 列):

# Desired result for the wide-format data, written as a structure() literal:
# the same 20 rows and columns as the input (factor `distribution`, integer
# Rating_1 ... Rating_5) plus five numeric columns Dist1 ... Dist5.
# For these "bimodal" rows the values correspond to Rating_k * 20 minus an
# offset, e.g. Dist1[1] = 25 * 20 - 5 = 495 and Dist2[2] = 8 * 20 - 0 = 160.
structure(list(distribution = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("bimodal", 
"extreme_left", "extreme_right", "left_skew", "right_skew", "trunc_norm_left", 
"trunc_norm_right", "trunc_uni_left", "trunc_uni_right", "uniform"
), class = "factor"), Rating_1 = c(25L, 15L, 15L, 5L, 5L, 4L, 
4L, 18L, 5L, 9L, 25L, 7L, 2L, 1L, 5L, 12L, 5L, 13L, 7L, 11L), 
    Rating_2 = c(0L, 8L, 6L, 4L, 5L, 11L, 8L, 7L, 12L, 6L, 0L, 
    11L, 10L, 1L, 11L, 10L, 10L, 8L, 10L, 5L), Rating_3 = c(0L, 
    4L, 9L, 9L, 13L, 20L, 10L, 0L, 16L, 8L, 0L, 14L, 26L, 5L, 
    16L, 6L, 20L, 8L, 13L, 9L), Rating_4 = c(0L, 8L, 6L, 19L, 
    19L, 11L, 13L, 7L, 12L, 12L, 0L, 11L, 10L, 9L, 13L, 10L, 
    10L, 8L, 12L, 9L), Rating_5 = c(25L, 15L, 14L, 13L, 8L, 4L, 
    15L, 18L, 5L, 15L, 25L, 7L, 2L, 34L, 5L, 12L, 5L, 13L, 8L, 
    16L), Dist1 = c(495, 295, 295, 95, 95, 75, 75, 355, 95, 175, 
    495, 135, 35, 15, 95, 235, 95, 255, 135, 215), Dist2 = c(0, 
    160, 120, 80, 100, 220, 160, 140, 240, 120, 0, 220, 200, 
    20, 220, 200, 200, 160, 200, 100), Dist3 = c(0, 80, 180, 
    180, 260, 400, 200, 0, 320, 160, 0, 280, 520, 100, 320, 120, 
    400, 160, 260, 180), Dist4 = c(0, 160, 120, 380, 380, 220, 
    260, 140, 240, 240, 0, 220, 200, 180, 260, 200, 200, 160, 
    240, 180), Dist5 = c(495, 295, 275, 255, 155, 75, 295, 355, 
    95, 295, 495, 135, 35, 675, 95, 235, 95, 255, 155, 315)), row.names = c(NA, 
20L), class = "data.frame")

任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:0)

可能不是最佳解决方案:

# Packages used by this answer.
library(dplyr)
library(tidyr)
library(stringr)

# Names of the rating columns present in `df` (Rating_1, Rating_2, ...).
rating_vars <- str_subset(names(df), "^Rating_")
N_ratings <- length(rating_vars)

# The offset table below is written with five values per distribution.
stopifnot(N_ratings == 5L)

# One entry per branch of the if / else chain in the question, in the same
# order. The final branch (the plain `else`) is labelled "uniform" because that
# is the only remaining level of `distribution` in the sample data; a literal
# "else" label would never match a row in the join.
distribution_levels <- c(
  "bimodal", "extreme_left", "extreme_right", "left_skew",
  "right_skew", "trunc_norm_left", "trunc_norm_right", "trunc_uni_left",
  "trunc_uni_right", "uniform"
)

# Lookup table in long format: the amount subtracted from Rating_k * 20 for
# every (distribution, rating) pair.
scalar_data <- data.frame(
  scalar = c(
    5, 0, 0, 0, 5, # bimodal (Dist5 = 5 matches the question's expected output)
    0, 0, 1, 2, 6, # extreme_left
    6, 3, 1, 0, 0, # extreme_right
    1, 1, 2, 2, 4, # left_skew
    4, 2, 2, 1, 1, # right_skew
    0, 0, 6, 3, 1, # trunc_norm_left
    1, 3, 6, 0, 0, # trunc_norm_right
    0, 0, 4, 3, 3, # trunc_uni_left
    3, 3, 4, 0, 0, # trunc_uni_right
    2, 2, 2, 2, 2  # uniform (fallback branch)
  ),
  distribution = rep(distribution_levels, each = N_ratings),
  # Repeat the rating names once per distribution so that all three columns
  # have the same length without relying on silent recycling.
  rating = rep(rating_vars, times = length(distribution_levels)),
  # Keep the join keys as character regardless of the R version's default.
  stringsAsFactors = FALSE
)

# Apply the per-distribution offsets from `scalar_data` and return to wide
# format:
#   1. tag each row with an id so it can be reassembled after reshaping;
#   2. stack the Rating_* columns into (rating, rating_value) pairs;
#   3. look up the offset for each (distribution, rating) pair;
#   4. compute rating_value * 20 - offset;
#   5. spread both the original ratings and the new values back out, giving
#      columns named rating_value_Rating_k and distribution_value_Rating_k.
df <- df %>%
  mutate(id = row_number()) %>%
  pivot_longer(
    # Same pattern that was used to build the `rating` keys of the lookup.
    matches("^Rating_"),
    names_to = "rating",
    values_to = "rating_value"
  ) %>%
  left_join(scalar_data, by = c("distribution", "rating")) %>%
  mutate(distribution_value = rating_value * 20 - scalar) %>%
  pivot_wider(
    id_cols = c(id, distribution),
    names_from = rating,
    values_from = c(rating_value, distribution_value)
  )

# Strip the value prefixes: rating_value_Rating_k -> Rating_k and
# distribution_value_Rating_k -> Distribution_k. Only the prefix is rewritten,
# so multi-digit suffixes such as Rating_10 keep their full index.
names(df) <- names(df) %>%
  str_replace("^rating_value_", "") %>%
  str_replace("^distribution_value_Rating_", "Distribution_")