我是R的新手。我有一个大型数据框,其中包含数百万行,如下所示:
Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3
64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9
55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4
70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7
26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2
47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8
35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1
根据代码1,代码2和代码3中的值,我正在寻找三个输出字段。
> Output1 : If code1 is 'a' or 'b', then Output1 = Whole*P_1, else Output1 = Whole* Q_1
> Output2 : If code1 is 'a' or 'b', then Output1 = Whole*P_2, else Output2 = Whole* Q_2
> Output3 : If code1 is 'a' or 'b', then Output1 = Whole*P_3, else Output3 = Whole* Q_3
如果能更正以下代码,将不胜感激:
df1 %>%
for (i in 1:6) {
if (paste0("code", i) %in% c("a", "b")) {
mutate (paste0("Output", i) = Whole * paste0("P_", i) )
} else {
mutate (paste0("Output", i) = Whole * paste0("Q_", i) )
}
}
答案 0 :(得分:1)
library(dplyr)
df1 %>%
mutate(
Output1 = Whole * if_else(code1 %in% c('a', 'b'), P_1, Q_1),
Output2 = Whole * if_else(code1 %in% c('a', 'b'), P_2, Q_2),
Output3 = Whole * if_else(code1 %in% c('a', 'b'), P_3, Q_3)
)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
# 1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
# 2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
# 3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
# 4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
# 5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
# 6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
如果您的数据更通用(不是硬编码或多于“ 3”组列),那么我们可以调整数据的形状,进行赋值然后将其重新成形。
library(tidyr)
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q)) %>%
pivot_wider(
id_cols = Whole,
names_from = set,
values_from = c(code, P, Q, Output),
names_sep = "_"
)
# # A tibble: 6 x 13
# Whole code_1 code_2 code_3 P_1 P_2 P_3 Q_1 Q_2 Q_3 Output_1 Output_2 Output_3
# <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 64 a b d 0.2 0.3 0.1 0.1 0.2 0.9 12.8 19.2 57.6
# 2 55 a c b 0.5 0.1 0.4 0.3 0.3 0.4 27.5 16.5 22
# 3 70 b d NULL 0.4 0.2 0.7 0.1 0.5 0.7 28 35 49
# 4 26 c a b 0.7 0.2 0.2 0.5 0.6 0.2 13 5.2 5.2
# 5 47 a d NULL 0.8 0.1 0.6 0.7 0.2 0.8 37.6 9.4 37.6
# 6 35 d b a 0.2 0.8 0.2 0.8 0.1 0.1 28 28 7
作为旁注,通常我建议将其保留为“长”格式,而不要重新扩展。对于其他类似整洁的函数(包括ggplot2
),通常首选这种“长”格式,并且很容易扩展为任意计数。这将导致这样的数据:
df1 %>%
rename_at(vars(starts_with("code")), ~ gsub("(\\D+)", "\\1_", .)) %>%
pivot_longer(
-Whole,
names_to = c(".value", "set"),
names_sep = "_"
) %>%
mutate(Output = Whole * if_else(code %in% c("a", "b"), P, Q))
# # A tibble: 18 x 6
# Whole set code P Q Output
# <int> <chr> <chr> <dbl> <dbl> <dbl>
# 1 64 1 a 0.2 0.1 12.8
# 2 64 2 b 0.3 0.2 19.2
# 3 64 3 d 0.1 0.9 57.6
# 4 55 1 a 0.5 0.3 27.5
# 5 55 2 c 0.1 0.3 16.5
# 6 55 3 b 0.4 0.4 22
# 7 70 1 b 0.4 0.1 28
# 8 70 2 d 0.2 0.5 35
# 9 70 3 NULL 0.7 0.7 49
# 10 26 1 c 0.7 0.5 13
# 11 26 2 a 0.2 0.6 5.2
# 12 26 3 b 0.2 0.2 5.2
# 13 47 1 a 0.8 0.7 37.6
# 14 47 2 d 0.1 0.2 9.4
# 15 47 3 NULL 0.6 0.8 37.6
# 16 35 1 d 0.2 0.8 28
# 17 35 2 b 0.8 0.1 28
# 18 35 3 a 0.2 0.1 7
(短得多。)
答案 1 :(得分:0)
我们可以使用map2
。获取具有“ P”,“ Q”的列的名称。 _
后跟数字。然后使用map2
遍历相应的列,应用转换逻辑并将这些列与原始数据集绑定
library(dplyr)
library(purrr)
library(stringr)
ps <- names(df1)[str_detect(names(df1), "^P_\\d+$")]
qs <- names(df1)[str_detect(names(df1), "^Q_\\d+$")]
map2_dfc(ps, qs, ~ df1 %>%
transmute(Output = Whole *
case_when(code1 %in% c('a', 'b') ~ !! rlang::sym(.x),
TRUE ~ !! rlang::sym(.y)))) %>%
rename_all(~ str_remove(., fixed("..."))) %>%
bind_cols(df1, .)
# Whole code1 P_1 Q_1 code2 P_2 Q_2 code3 P_3 Q_3 Output1 Output2 Output3
#1 64 a 0.2 0.1 b 0.3 0.2 d 0.1 0.9 12.8 19.2 6.4
#2 55 a 0.5 0.3 c 0.1 0.3 b 0.4 0.4 27.5 5.5 22.0
#3 70 b 0.4 0.1 d 0.2 0.5 NULL 0.7 0.7 28.0 14.0 49.0
#4 26 c 0.7 0.5 a 0.2 0.6 b 0.2 0.2 13.0 15.6 5.2
#5 47 a 0.8 0.7 d 0.1 0.2 NULL 0.6 0.8 37.6 4.7 28.2
#6 35 d 0.2 0.8 b 0.8 0.1 a 0.2 0.1 28.0 3.5 3.5
df1 <- structure(list(Whole = c(64L, 55L, 70L, 26L, 47L, 35L), code1 = c("a",
"a", "b", "c", "a", "d"), P_1 = c(0.2, 0.5, 0.4, 0.7, 0.8, 0.2
), Q_1 = c(0.1, 0.3, 0.1, 0.5, 0.7, 0.8), code2 = c("b", "c",
"d", "a", "d", "b"), P_2 = c(0.3, 0.1, 0.2, 0.2, 0.1, 0.8), Q_2 = c(0.2,
0.3, 0.5, 0.6, 0.2, 0.1), code3 = c("d", "b", "NULL", "b", "NULL",
"a"), P_3 = c(0.1, 0.4, 0.7, 0.2, 0.6, 0.2), Q_3 = c(0.9, 0.4,
0.7, 0.2, 0.8, 0.1)), class = "data.frame", row.names = c(NA,
-6L))
答案 2 :(得分:0)
根据您的行数,此data.table方法可能会更快。
library(data.table)
setDT(df1)[,Logical := (code1 == "a" | code1 == "b")][
,`:=`(Output1 = numeric(),Output2 = numeric(), Output3 = numeric())
][Logical == TRUE,`:=`(Output1 = Whole * P_1,
Output2 = Whole * P_2,
Output3 = Whole * P_3)
][Logical == FALSE,`:=`(Output1 = Whole * Q_1,
Output2 = Whole * Q_2,
Output3 = Whole * Q_3)
][,.(Output1,Output2,Output3)]
Output1 Output2 Output3
1: 12.8 19.2 6.4
2: 27.5 5.5 22.0
3: 28.0 14.0 49.0
4: 13.0 15.6 5.2
5: 37.6 4.7 28.2
6: 28.0 3.5 3.5