在这种转换中,我需要帮助。
我有一个长格式的数据帧,其中包含模拟数据(约 1 万次模拟)以及其他特征(列)。
我想将此长数据帧转换为宽数据帧,但希望将每个模拟放在单独的行中。
我尝试了默认的长转宽方法,但它们会把每个模拟变成一个新的列(特征),而我希望每个模拟各占一行。
虚拟数据集:
# Dummy long-format data: one row per (frame, attribute) pair.
# The misspelled entries ("frmae1", "frmae2", "¨sim2.val", "sim3-val") are
# kept exactly as given.
name1 <- c(
  rep("frame1", 8L), "frmae1", "frame1",
  rep("frame2", 8L), "frmae2", "frame2"
)
name2 <- c(
  # attributes of the first frame
  "arch", "conf", "f.time.c", "f.rev.c",
  "sim1.f", "sim1.val", "sim2.f", "¨sim2.val", "sim3.f", "sim3-val",
  # attributes of the second frame
  "arch", "conf", "f.time.c", "f.rev.c",
  "sim1.f", "sim1.val", "sim2.f", "¨sim2.val", "sim3.f", "sim3.val"
)
value <- c(
  "x86", "cust", "100", "20",
  "-a -b -c=10", "150", "-d -e=1 -f -z -x", "40", "-r -e -f -r", "89",
  "x24", "default", "500", "2",
  "-a1 -b34 -c=12", "99", "-a -e", "100", "-e -z ", "120"
)
# Column names are given explicitly; they equal the vector names.
df <- data.frame(name1 = name1, name2 = name2, value = value)
数据集看起来像这样:
name1 name2 value
1 frame1 arch x86
2 frame1 conf cust
3 frame1 f.time.c 100
4 frame1 f.rev.c 20
5 frame1 sim1.f -a -b -c=10
6 frame1 sim1.val 150
7 frame1 sim2.f -d -e=1 -f -z -x
8 frame1 sim2.val 40
9 frmae1 sim3.f -r -e -f -r
10 frame1 sim3.val 89
-------------- Simulation for frame 1 end here
11 frame2 arch x24
12 frame2 conf default
13 frame2 f.time.c 500
14 frame2 f.rev.c 2
15 frame2 sim1.f -a1 -b34 -c=12
16 frame2 sim1.val 99
17 frame2 sim2.f -a -e
18 frame2 sim2.val 100
19 frmae2 sim3.f -e -z
20 frame2 sim3.val 120
我想把它转换成如下的数据帧:(各帧的特征在不同的模拟行中重复出现)
frame arch conf f.time.c f.rev.c sim_number sim.f vale
1 frame1 x86 cust 100 20 sim1 -a -b -c=10 150
2 frame1 x86 cust 100 20 sim2 -d -e=1 -f -z -x 40
3 frame1 x86 cust 100 20 sim3 -r -e -f -r 89
4 frame2 x24 default 500 2 sim1 -a1 -b34 -c=12 99
5 frame2 x24 default 500 2 sim2 -a -e 100
6 frame2 x24 default 500 2 sim3 -e -z 120
答案 0 :(得分:3)
您可以结合使用 data.table 的 dcast 和 melt 函数:
1. 首先按 name2 列把数据转换(dcast)为宽格式,各单元格取 value 列的值。
2. 然后把以 sim 开头的列(例如 sim1.f 和 sim1.val)重新融合(melt)为较长的格式。
library(data.table)
## Step 1: cast to wide format -- one row per name1, one column per
## distinct name2 entry, cells taken from `value`.
## setDT() turns `df` into a data.table by reference before the cast.
setDT(df)
df_wide <- dcast(df, name1 ~ name2, value.var = "value")

## Step 2: melt the paired simulation columns back to long format.
## The two patterns pick the columns ending in ".f" and ".val"; each pattern
## feeds one output column, and sim_number identifies the pair.
melt(
  data = df_wide,
  measure.vars = patterns("\\.f$", "\\.val$"),
  variable.name = "sim_number",
  value.name = c("sim.f", "value")
)
#> name1 arch conf f.rev.c f.time.c sim_number sim.f value
#> 1: frame1 x86 cust 20 100 1 -a -b -c=10 150
#> 2: frame2 x24 default 2 500 1 -a1 -b34 -c=12 99
#> 3: frame1 x86 cust 20 100 2 -d -e=1 -f -z -x 40
#> 4: frame2 x24 default 2 500 2 -a -e 100
#> 5: frame1 x86 cust 20 100 3 -r -e -f -r 89
#> 6: frame2 x24 default 2 500 3 -e -z 120
数据
# Cleaned copy of the example data; all three columns are factors.
# Factor levels are listed explicitly so the integer codes do not depend on
# locale-specific sorting.
df <- data.frame(
  name1 = factor(
    rep(c("frame1", "frame2"), each = 10L),
    levels = c("frame1", "frame2")
  ),
  name2 = factor(
    rep(
      c("arch", "conf", "f.time.c", "f.rev.c", "sim1.f", "sim1.val",
        "sim2.f", "sim2.val", "sim3.f", "sim3.val"),
      times = 2L
    ),
    levels = c("arch", "conf", "f.rev.c", "f.time.c", "sim1.f", "sim1.val",
               "sim2.f", "sim2.val", "sim3.f", "sim3.val")
  ),
  value = factor(
    c("x86", "cust", "100", "20", "-a -b -c=10", "150",
      "-d -e=1 -f -z -x", "40", "-r -e -f -r", "89",
      "x24", "default", "500", "2", "-a1 -b34 -c=12", "99",
      "-a -e", "100", "-e -z ", "120"),
    levels = c("-a -b -c=10", "-a -e", "-a1 -b34 -c=12", "-d -e=1 -f -z -x",
               "-e -z ", "-r -e -f -r", "100", "120", "150", "2", "20", "40",
               "500", "89", "99", "cust", "default", "x24", "x86")
  )
)
注意:在原始数据中,我把 '¨sim2.val' 替换成了 'sim2.val',并把 'frmae1' 和 'frmae2' 分别替换成了 'frame1' 和 'frame2'。
答案 1 :(得分:2)
一种可行(但不算简洁)的 tidyverse 方法:
library(tidyverse)

# Reshape the long `df` so that every simulation of every frame is one row.
df %>%
  # Repair the misspelled frame labels so all rows of a frame share one key.
  mutate(name1 = gsub("frmae", "frame", name1)) %>%
  # Take the simulation id ("sim1", "sim2", ...) from the still-unmodified
  # name2; where the pattern does not match, the name2 text is kept as is.
  mutate(sim_number = gsub("^.*(sim\\d+).*", "\\1", name2)) %>%
  # Collapse the per-simulation attribute names onto two shared keys.
  mutate(name2 = gsub("^sim.*f$", "sim.f", name2)) %>%
  mutate(name2 = gsub("^.*sim\\d+.*val$", "sim.val", name2)) %>%
  # One column per name2 key, one row per (name1, sim_number) pair.
  spread(key = name2, value = value) %>%
  group_by(name1) %>%
  # Fill the gaps within each frame: first downwards, then upwards.
  fill(names(.)) %>%
  fill(names(.), .direction = "up") %>%
  # Keep only the rows that describe a simulation.
  filter(grepl("sim", sim_number))
结果如下:
# A tibble: 6 x 8
# Groups: name1 [2]
name1 sim_number arch conf f.rev.c f.time.c sim.f sim.val
<chr> <chr> <fct> <fct> <fct> <fct> <fct> <fct>
1 frame1 sim1 x86 cust 20 100 -a -b -c=10 150
2 frame1 sim2 x86 cust 20 100 -d -e=1 -f -z -x 40
3 frame1 sim3 x86 cust 20 100 -r -e -f -r 89
4 frame2 sim1 x24 default 2 500 -a1 -b34 -c=12 99
5 frame2 sim2 x24 default 2 500 -a -e 100
6 frame2 sim3 x24 default 2 500 "-e -z " 120