我想基于 if / else if / else 语句创建新的变量(具体是一个还是多个,取决于数据是长格式还是宽格式)。我熟悉 mutate 的用法,但似乎无法让它同时适用于多个变量。我看到过很多类似的帖子,也尝试了其中提供的解决方案,但都没有成功。
以下是宽格式数据的示例:
# Sample input in wide format, as produced by dput(): a data.frame with 20
# rows. `distribution` is a factor with ten levels; every row in this sample
# holds the first level ("bimodal"). Rating_1 .. Rating_5 are integer columns.
structure(list(distribution = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("bimodal",
"extreme_left", "extreme_right", "left_skew", "right_skew", "trunc_norm_left",
"trunc_norm_right", "trunc_uni_left", "trunc_uni_right", "uniform"
), class = "factor"), Rating_1 = c(25L, 15L, 15L, 5L, 5L, 4L,
4L, 18L, 5L, 9L, 25L, 7L, 2L, 1L, 5L, 12L, 5L, 13L, 7L, 11L),
Rating_2 = c(0L, 8L, 6L, 4L, 5L, 11L, 8L, 7L, 12L, 6L, 0L,
11L, 10L, 1L, 11L, 10L, 10L, 8L, 10L, 5L), Rating_3 = c(0L,
4L, 9L, 9L, 13L, 20L, 10L, 0L, 16L, 8L, 0L, 14L, 26L, 5L,
16L, 6L, 20L, 8L, 13L, 9L), Rating_4 = c(0L, 8L, 6L, 19L,
19L, 11L, 13L, 7L, 12L, 12L, 0L, 11L, 10L, 9L, 13L, 10L,
10L, 8L, 12L, 9L), Rating_5 = c(25L, 15L, 14L, 13L, 8L, 4L,
15L, 18L, 5L, 15L, 25L, 7L, 2L, 34L, 5L, 12L, 5L, 13L, 8L,
16L)), row.names = c(NA, 20L), class = "data.frame")
这是我尝试的策略之一:
# Attempt from the question (does not work as written). Review notes:
# - `if` / `else if` takes a single TRUE/FALSE condition, whereas each
#   `distribution == "..."` test below compares against a whole column; a
#   per-row choice needs a vectorised tool such as case_when() or a join
#   against a look-up table.
# - `distribution` is referenced outside of any dplyr verb, and the mutate()
#   calls in the branches are not given a data frame to work on.
# Intended rule per branch: DistK = Rating_K * 20 - <offset for that
# distribution and rating>.
df %>%
arrange(distribution) %>%
if (distribution == "bimodal") {
mutate(Dist1 = Rating_1 * 20 - 5,
Dist2 = Rating_2 * 20 - 0,
Dist3 = Rating_3 * 20 - 0,
Dist4 = Rating_4 * 20 - 0,
# NOTE(review): the expected output shown later in the post has
# Dist5 = Rating_5 * 20 - 5 for the bimodal rows, not `- 0` -- confirm.
Dist5 = Rating_5 * 20 - 0)
} else if (distribution == "extreme_left") {
mutate(Dist1 = Rating_1 * 20 - 0,
Dist2 = Rating_2 * 20 - 0,
Dist3 = Rating_3 * 20 - 1,
Dist4 = Rating_4 * 20 - 2,
Dist5 = Rating_5 * 20 - 6)
} else if (distribution == "extreme_right") {
mutate (Dist1 = Rating_1 * 20 - 6,
Dist2 = Rating_2 * 20 - 3,
Dist3 = Rating_3 * 20 - 1,
Dist4 = Rating_4 * 20 - 0,
Dist5 = Rating_5 * 20 - 0)
} else if (distribution == "left_skew") {
mutate (Dist1 = Rating_1 * 20 - 1,
Dist2 = Rating_2 * 20 - 1,
Dist3 = Rating_3 * 20 - 2,
Dist4 = Rating_4 * 20 - 2,
Dist5 = Rating_5 * 20 - 4)
} else if (distribution == "right_skew") {
mutate (Dist1 = Rating_1 * 20 - 4,
Dist2 = Rating_2 * 20 - 2,
Dist3 = Rating_3 * 20 - 2,
Dist4 = Rating_4 * 20 - 1,
Dist5 = Rating_5 * 20 - 1)
} else if (distribution == "trunc_norm_left") {
mutate (Dist1 = Rating_1 * 20 - 0,
Dist2 = Rating_2 * 20 - 0,
Dist3 = Rating_3 * 20 - 6,
Dist4 = Rating_4 * 20 - 3,
Dist5 = Rating_5 * 20 - 1)
} else if (distribution == "trunc_norm_right") {
mutate (Dist1 = Rating_1 * 20 - 1,
Dist2 = Rating_2 * 20 - 3,
Dist3 = Rating_3 * 20 - 6,
Dist4 = Rating_4 * 20 - 0,
Dist5 = Rating_5 * 20 - 0)
} else if (distribution == "trunc_uni_left") {
mutate (Dist1 = Rating_1 * 20 - 0,
Dist2 = Rating_2 * 20 - 0,
Dist3 = Rating_3 * 20 - 4,
Dist4 = Rating_4 * 20 - 3,
Dist5 = Rating_5 * 20 - 3)
# NOTE(review): the comparison operator is missing in the next condition
# (presumably `==` was intended); as written this line is a parse error.
} else if (distribution "trunc_uni_right") {
mutate(Dist1 = Rating_1 * 20 - 3,
Dist2 = Rating_2 * 20 - 3,
Dist3 = Rating_3 * 20 - 4,
Dist4 = Rating_4 * 20 - 0,
Dist5 = Rating_5 * 20 - 0)
# Fallback branch: of the ten factor levels in the sample data, the only one
# not tested above is "uniform".
} else {
mutate (Dist1 = Rating_1 * 20 - 2,
Dist2 = Rating_2 * 20 - 2,
Dist3 = Rating_3 * 20 - 2,
Dist4 = Rating_4 * 20 - 2,
Dist5 = Rating_5 * 20 - 2)
}
我尝试过使用长格式和宽格式数据,尽管我意识到长格式数据还缺少一些东西。
如果数据是宽格式,我希望得到类似下面这样的结果:
# Expected result in wide format, as produced by dput(): the same 20 "bimodal"
# rows and Rating_1 .. Rating_5 columns as the sample input, plus the new
# numeric columns Dist1 .. Dist5.
# In the values below, Dist1 = Rating_1 * 20 - 5, Dist2 .. Dist4 =
# Rating_K * 20, and Dist5 = Rating_5 * 20 - 5.
structure(list(distribution = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("bimodal",
"extreme_left", "extreme_right", "left_skew", "right_skew", "trunc_norm_left",
"trunc_norm_right", "trunc_uni_left", "trunc_uni_right", "uniform"
), class = "factor"), Rating_1 = c(25L, 15L, 15L, 5L, 5L, 4L,
4L, 18L, 5L, 9L, 25L, 7L, 2L, 1L, 5L, 12L, 5L, 13L, 7L, 11L),
Rating_2 = c(0L, 8L, 6L, 4L, 5L, 11L, 8L, 7L, 12L, 6L, 0L,
11L, 10L, 1L, 11L, 10L, 10L, 8L, 10L, 5L), Rating_3 = c(0L,
4L, 9L, 9L, 13L, 20L, 10L, 0L, 16L, 8L, 0L, 14L, 26L, 5L,
16L, 6L, 20L, 8L, 13L, 9L), Rating_4 = c(0L, 8L, 6L, 19L,
19L, 11L, 13L, 7L, 12L, 12L, 0L, 11L, 10L, 9L, 13L, 10L,
10L, 8L, 12L, 9L), Rating_5 = c(25L, 15L, 14L, 13L, 8L, 4L,
15L, 18L, 5L, 15L, 25L, 7L, 2L, 34L, 5L, 12L, 5L, 13L, 8L,
16L), Dist1 = c(495, 295, 295, 95, 95, 75, 75, 355, 95, 175,
495, 135, 35, 15, 95, 235, 95, 255, 135, 215), Dist2 = c(0,
160, 120, 80, 100, 220, 160, 140, 240, 120, 0, 220, 200,
20, 220, 200, 200, 160, 200, 100), Dist3 = c(0, 80, 180,
180, 260, 400, 200, 0, 320, 160, 0, 280, 520, 100, 320, 120,
400, 160, 260, 180), Dist4 = c(0, 160, 120, 380, 380, 220,
260, 140, 240, 240, 0, 220, 200, 180, 260, 200, 200, 160,
240, 180), Dist5 = c(495, 295, 275, 255, 155, 75, 295, 355,
95, 295, 495, 135, 35, 675, 95, 235, 95, 255, 155, 315)), row.names = c(NA,
20L), class = "data.frame")
任何帮助将不胜感激。
答案 0 :(得分:0)
可能不是最佳解决方案:
# Look-up-table solution: keep one offset per (distribution, rating) pair in a
# long table, join it onto the reshaped data, and compute every new value in a
# single vectorised step instead of branching with if / else.
#
# Input:  `df`, a wide data frame with a `distribution` column and the rating
#         columns Rating_1, Rating_2, ...
# Output: `df` is overwritten with columns id, distribution, Rating_<k> and
#         Distribution_<k>, where Distribution_<k> = Rating_<k> * 20 - offset.
library(dplyr)
library(tidyr)
library(stringr)

rating_vars <- str_subset(names(df), "^Rating_")
N_ratings <- length(rating_vars)

# The last entry must be the real factor level "uniform" (the question's
# `else` branch): a placeholder label such as "else" matches no row in the
# join and would leave every uniform row with an NA offset.
distribution_levels <- c(
  "bimodal", "extreme_left", "extreme_right", "left_skew", "right_skew",
  "trunc_norm_left", "trunc_norm_right", "trunc_uni_left", "trunc_uni_right",
  "uniform"
)

# One row of offsets per distribution, in the order of `distribution_levels`;
# within a row the values follow the order of `rating_vars`.
# NOTE(review): the expected output in the question uses an offset of 5 (not
# 0) for the fifth bimodal rating -- confirm which one is intended.
offsets <- c(
  5, 0, 0, 0, 0, # bimodal
  0, 0, 1, 2, 6, # extreme_left
  6, 3, 1, 0, 0, # extreme_right
  1, 1, 2, 2, 4, # left_skew
  4, 2, 2, 1, 1, # right_skew
  0, 0, 6, 3, 1, # trunc_norm_left
  1, 3, 6, 0, 0, # trunc_norm_right
  0, 0, 4, 3, 3, # trunc_uni_left
  3, 3, 4, 0, 0, # trunc_uni_right
  2, 2, 2, 2, 2  # uniform
)

# Fail loudly instead of letting data.frame() recycle a vector that is too
# short when the number of rating columns differs from what the table assumes.
stopifnot(length(offsets) == length(distribution_levels) * N_ratings)

scalar_data <- data.frame(
  scalar = offsets,
  distribution = rep(distribution_levels, each = N_ratings),
  # Repeat the rating names once per distribution (not once per rating).
  rating = rep(rating_vars, times = length(distribution_levels)),
  stringsAsFactors = FALSE
)

df <- df %>%
  mutate(
    id = seq_len(n()),
    # Join on character keys so the factor/character mismatch is explicit.
    distribution = as.character(distribution)
  ) %>%
  pivot_longer(
    all_of(rating_vars),
    names_to = "rating",
    values_to = "rating_value"
  ) %>%
  left_join(scalar_data, by = c("distribution", "rating")) %>%
  mutate(distribution_value = rating_value * 20 - scalar) %>%
  pivot_wider(
    id_cols = c(id, distribution),
    names_from = rating,
    values_from = c(rating_value, distribution_value)
  )

# pivot_wider() names the columns "<value column>_<rating>"; strip the prefix
# from the ratings and turn "distribution_value_Rating_<k>" into
# "Distribution_<k>".
names(df) <- names(df) %>%
  str_replace("^rating_value_", "") %>%
  str_replace("^distribution_value_Rating_", "Distribution_")