我有一个这样的数据框:
library(tidyverse)
df <- tibble(RANGER.MAE = c(0.22, 0.22, 0.21, 0.23, 0.20),
RANGER.RMSE = c(0.33, 0.48, 0.48, 0.48, 0.28),
QUANTILE.MAE = c(0.22, 0.21, 0.21, 0.22, 0.18),
QUANTILE.RMSE = c(0.35, 0.50, 0.49, 0.49, 0.29))
print(df)
RANGER.MAE RANGER.RMSE QUANTILE.MAE QUANTILE.RMSE
<dbl> <dbl> <dbl> <dbl>
1 0.22 0.33 0.22 0.35
2 0.22 0.48 0.21 0.5
3 0.21 0.48 0.21 0.49
4 0.23 0.48 0.22 0.49
5 0.2 0.28 0.18 0.29
我想在列中添加前缀(例如“ RANGER”和“ QUANTILE”),并在列中添加后缀(例如“ MAE”和“ RMSE”)。
我想要的输出是:
MAE RMSE
RANGER 0.22 0.33
RANGER 0.22 0.48
RANGER 0.21 0.48
RANGER 0.23 0.48
RANGER 0.20 0.48
QUANTILE 0.22 0.35
QUANTILE 0.21 0.50
QUANTILE 0.21 0.49
QUANTILE 0.22 0.49
QUANTILE 0.18 0.29
我尝试了gather(df)
,但我想我还需要一个参数来用后缀分隔列。我想在tidyverse中做到这一点。
答案 0 :(得分:2)
一种tidyverse
可能是:
df %>%
rowid_to_column() %>%
gather(var, val, -rowid) %>%
separate(var, into = c("var", "col")) %>%
spread(col, val) %>%
select(-rowid)
var MAE RMSE
<chr> <dbl> <dbl>
1 QUANTILE 0.22 0.35
2 RANGER 0.22 0.33
3 QUANTILE 0.21 0.5
4 RANGER 0.22 0.48
5 QUANTILE 0.21 0.49
6 RANGER 0.21 0.48
7 QUANTILE 0.22 0.49
8 RANGER 0.23 0.48
9 QUANTILE 0.18 0.290
10 RANGER 0.2 0.28
答案 1 :(得分:1)
Het是data.table的一种方法,因此应在想要扔给它的任意大小的data.frame上运行 fast 。
我知道您要求tidyverse,但是data.table实际上(我认为可以) 工具最适合处理这种操作。基准证明了我的观点(我认为)。
library(data.table)
library(forcats)
#we are almost there with a one-liner data.table melt
ans <- melt( setDT(df), measure.vars = patterns( MAE = "MAE", RMSE = "RMSE") )
# variable MAE RMSE
# 1: 1 0.22 0.33
# 2: 1 0.22 0.48
# 3: 1 0.21 0.48
# 4: 1 0.23 0.48
# 5: 1 0.20 0.28
# 6: 2 0.22 0.35
# 7: 2 0.21 0.50
# 8: 2 0.21 0.49
# 9: 2 0.22 0.49
# 10: 2 0.18 0.29
#just set the factors of the variable to the desired character string
ans[, variable := forcats::lvls_revalue(variable, c("RANGER", "QUANTILE"))][]
# variable MAE RMSE
# 1: RANGER 0.22 0.33
# 2: RANGER 0.22 0.48
# 3: RANGER 0.21 0.48
# 4: RANGER 0.23 0.48
# 5: RANGER 0.20 0.28
# 6: QUANTILE 0.22 0.35
# 7: QUANTILE 0.21 0.50
# 8: QUANTILE 0.21 0.49
# 9: QUANTILE 0.22 0.49
# 10:QUANTILE 0.18 0.29
在100000行df上的一些基准测试
n = 100000
set.seed = 123
df <- tibble(RANGER.MAE = runif( n, 0, 1),
RANGER.RMSE = runif( n, 0, 1),
QUANTILE.MAE = runif( n, 0, 1),
QUANTILE.RMSE = runif( n, 0, 1))
microbenchmark::microbenchmark(
data.table = {
dt <- copy( df )
ans <- melt( setDT(dt), measure.vars = patterns( MAE = "MAE", RMSE = "RMSE") )
ans[, variable := forcats::lvls_revalue(variable, c("RANGER", "QUANTILE"))][]
},
tidyverse = {
dt <- copy( df )
df %>%
rowid_to_column() %>%
gather(var, val, -rowid) %>%
separate(var, into = c("var", "col")) %>%
spread(col, val) %>%
select(-rowid)
}, times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# data.table 2.050403 3.970056 3.82328 4.112553 4.423759 4.559629 5
# tidyverse 478.239258 563.140243 577.40732 572.687597 598.821034 674.148455 5