我经常发现自己的情况是我有一个包含多组宽列的表,如下所示:
replicate groupA VA1 VA2 groupB VB1 VB2
1 1 a 0.3429166 -2.30336406 f 0.05363582 1.6454078
2 2 b -1.3183732 -0.13516849 g -0.42586417 0.1541541
3 3 c -0.7908358 -0.10746447 h 1.05134242 1.4297350
4 4 d -0.9963677 -1.82557058 i -1.14532536 1.0815733
5 5 e -1.3634609 0.04385812 j -0.65643595 -0.1452877
我想将列变成一个长表,如下所示:
replicate group key value
1 1 a V1 0.34291665
2 2 b V1 -1.31837322
3 3 c V1 -0.79083580
4 4 d V1 -0.99636772
5 5 e V1 -1.36346088
6 1 a V2 -2.30336406
7 2 b V2 -0.13516849
8 3 c V2 -0.10746447
9 4 d V2 -1.82557058
10 5 e V2 0.04385812
11 1 f V1 0.05363582
12 2 g V1 -0.42586417
13 3 h V1 1.05134242
14 4 i V1 -1.14532536
15 5 j V1 -0.65643595
16 1 f V2 1.64540784
17 2 g V2 0.15415408
18 3 h V2 1.42973499
19 4 i V2 1.08157329
20 5 j V2 -0.14528774
我可以通过单独选择两组列,整理然后再组合(下面的代码)来完成此操作。然而,这种方法似乎并不特别优雅,如果有两组以上的列,则会变得很麻烦。我想知道使用单一数据转换管道是否有更优雅的方法。
这里的基本问题是:我们如何自动化将表分成列组,整理列,然后重新组合在一起的过程。
我目前的代码:
library(dplyr)
library(tidyr)
# generate example code
df_wide <- data.frame(replicate = 1:5,
groupA = letters[1:5],
VA1 = rnorm(5),
VA2 = rnorm(5),
groupB = letters[6:10],
VB1 = rnorm(5),
VB2 = rnorm(5))
# tidy columns with A in the name
dfA <- select(df_wide, replicate, groupA, VA1, VA2) %>%
gather(key, value, VA1, VA2) %>%
mutate(key = case_when(key == "VA1" ~ "V1",
key == "VA2" ~ "V2")) %>%
select(replicate, group = groupA, key, value)
# tidy columns with B in the name
dfB <- select(df_wide, replicate, groupB, VB1, VB2) %>%
gather(key, value, VB1, VB2) %>%
mutate(key = case_when(key == "VB1" ~ "V1",
key == "VB2" ~ "V2")) %>%
select(replicate, group = groupB, key, value)
# combine
df_long <- rbind(dfA, dfB)
答案 0 :(得分:3)
<强> 1 强>
虽然问题是tidyverse
解决方案,但melt
的{{1}}有一个方便的选项,data.table
也可以使用patterns
}参数。
measure
<强> 2。一个强>
使用library(data.table)
setnames(melt(melt(setDT(df1), measure = patterns('group', 'VA', 'VB')),
id.var = 1:3)[, -4, with = FALSE], 2:3, c('key', 'group'))[]
我们可以将数据集子集化为tidyverse
,然后使用list
循环遍历list
,并使用map_df
将其转换为“long”格式获得单个data.frame
gather
<强> 2.B 强>
如果我们需要library(tidyverse)
list(df1[1:4], df1[c(1,5:7)]) %>%
map_df(~gather(., key, value, 3:4) %>%
{names(.)[2] <- 'group';.}) %>%
mutate(key = sub('(.).(.)', '\\1\\2', key))
# replicate group key value
#1 1 a V1 0.34291660
#2 2 b V1 -1.31837320
#3 3 c V1 -0.79083580
#4 4 d V1 -0.99636770
#5 5 e V1 -1.36346090
#6 1 a V2 -2.30336406
#7 2 b V2 -0.13516849
#8 3 c V2 -0.10746447
#9 4 d V2 -1.82557058
#10 5 e V2 0.04385812
#11 1 f V1 0.05363582
#12 2 g V1 -0.42586417
#13 3 h V1 1.05134242
#14 4 i V1 -1.14532536
#15 5 j V1 -0.65643595
#16 1 f V2 1.64540780
#17 2 g V2 0.15415410
#18 3 h V2 1.42973500
#19 4 i V2 1.08157330
#20 5 j V2 -0.14528770
根据“群组”的出现
split
<强> 2.C 强>
以split.default(df1[-1], cumsum(grepl('group', names(df1)[-1]))) %>%
map(~bind_cols(df1[1], .)) %>%
map_df(~gather(., key, value, 3:4) %>%
{names(.)[2] <- 'group';.}) %>%
mutate(key = sub('(.).(.)', '\\1\\2', key))
选项的精神包含rename_at
而不是names
作业
tidyverse
注意:
1)df1[-1] %>%
split.default(cumsum(grepl('group', names(df1)[-1]))) %>%
map_df(~bind_cols(df1[1], .) %>%
gather(., key, value, 3:4) %>%
rename_at(2, funs(substring(.,1, 5))))
,2.a
,2.b
都使用了tidyverse函数
2)它不依赖于列名
中的子串'A'或'B'3)假设OP数据集中的模式为“group”,后跟值列
答案 1 :(得分:1)
1)此解决方案包含:
首先收集名称以V开头的列,然后在groupA和groupB中创建一个新的group列,如果键中有A,则选择groupA,如果键中有B,则为groupB。 (我们在这里使用了mapply(switch,...)以便轻松扩展到3+组案例,但我们可以使用ifelse,即ifelse(grepl(“A”,key),as.character(groupA),as .character(groupB)),因为我们只有两个组。)mutate还将键名从VA1减少到V1等,最后选出所需的列。
DF %>%
gather(key, value, starts_with("V")) %>%
mutate(group = mapply(switch, gsub("[^AB]", "", key), A = groupA, B = groupB),
key = sub("[AB]", "", key)) %>%
select(replicate, group, key, value)
,并提供:
replicate group key value
1 1 a V1 0.34291660
2 2 b V1 -1.31837320
3 3 c V1 -0.79083580
4 4 d V1 -0.99636770
5 5 e V1 -1.36346090
6 1 a V2 -2.30336406
7 2 b V2 -0.13516849
8 3 c V2 -0.10746447
9 4 d V2 -1.82557058
10 5 e V2 0.04385812
11 1 f V1 0.05363582
12 2 g V1 -0.42586417
13 3 h V1 1.05134242
14 4 i V1 -1.14532536
15 5 j V1 -0.65643595
16 1 f V2 1.64540780
17 2 g V2 0.15415410
18 3 h V2 1.42973500
19 4 i V2 1.08157330
20 5 j V2 -0.14528770
2)另一种方法是将列拆分为组,以便在从名称中删除A和B后,组中的所有列具有相同的名称。 Performi在每个这样的组上取消列表,将列表缩减为普通向量列表,并将该列表转换为data.frame。最后收集V列并重新排列。请注意,rownames_to_column来自tibble包。
DF %>%
as.list %>%
split(sub("[AB]", "", names(.))) %>%
lapply(unlist) %>%
as.data.frame %>%
rownames_to_column %>%
gather(key, value, starts_with("V")) %>%
arrange(gsub("[^AB]", "", rowname), key) %>%
select(replicate, group, key, value)
2a)如果行顺序不重要,则可以省略rownames_to_column,排列和选择行,将其缩短为:
DF %>%
as.list %>%
split(sub("[AB]", "", names(.))) %>%
lapply(unlist) %>%
as.data.frame %>%
gather(key, value, starts_with("V"))
解决方案(2)和(2a)可以很容易地转换为仅基础解决方案,通过用基础中的适当的重塑形式替换聚集,如第二个重塑,即(3)中产生d2的那个。
3)虽然问题需要一个整数解决方案,但有一个相当方便的基本解决方案,包括两个重塑调用。拆分产生的变化是:list(group = c("groupA", "groupB"), V1 = c("VA1", "VB1"), V2 = c("VA2", "VB2"))
- 即它匹配每组列中的第i列。
varying <- split(names(DF)[-1], gsub("[AB]", "", names(DF))[-1])
d <- reshape(DF, dir = "long", varying = varying, v.names = names(varying))
d <- subset(d, select = -c(time, id))
d2 <- reshape(d, dir = "long", varying = list(grep("V", names(d))), v.names = "value",
timevar = "key")
d2 <- subset(d2, select = c(replication, group, key, value))
d2
注意:可重复形式的输入是:
DF <- structure(list(replicate = 1:5, groupA = structure(1:5, .Label = c("a",
"b", "c", "d", "e"), class = "factor"), VA1 = c(0.3429166, -1.3183732,
-0.7908358, -0.9963677, -1.3634609), VA2 = c(-2.30336406, -0.13516849,
-0.10746447, -1.82557058, 0.04385812), groupB = structure(1:5, .Label = c("f",
"g", "h", "i", "j"), class = "factor"), VB1 = c(0.05363582, -0.42586417,
1.05134242, -1.14532536, -0.65643595), VB2 = c(1.6454078, 0.1541541,
1.429735, 1.0815733, -0.1452877)), .Names = c("replicate", "groupA",
"VA1", "VA2", "groupB", "VB1", "VB2"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))