Question

我有一些数据结构如下：

# Example data in long ("tidy") format, reproduced from dput() output.
# One row per subject x item occurrence x measurement: `measurement` says
# whether `value` holds a color or a time, and `item_pos` is the position at
# which the item was shown. The result is assigned to `df.tidy` because the
# snippets further down refer to the data by that name.
df.tidy <- structure(
  list(
    subject = c(
      1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
      2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
      3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
      4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L
    ),
    group = structure(
      c(
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
      ),
      .Label = c("group1", "group2"),
      class = "factor"
    ),
    measurement = c(
      "color", "time", "color", "time", "color", "time", "color", "time",
      "color", "time", "color", "time", "color", "time", "color", "time",
      "color", "time", "color", "time", "color", "time", "color", "time",
      "color", "time", "color", "time", "color", "time", "color", "time"
    ),
    item_pos = c(
      "1", "1", "2", "2", "3", "3", "4", "4",
      "1", "1", "2", "2", "3", "3", "4", "4",
      "1", "1", "2", "2", "3", "3", "4", "4",
      "1", "1", "2", "2", "3", "3", "4", "4"
    ),
    value = c(
      "blue", "1508", "orange", "752", "black", "585", "red", "842",
      "red", "879", "white", "1455", "green", "1757", "orange", "2241",
      "white", "2251", "yellow", "1740", "red", "1962", "yellow", "1854",
      "green", "1859", "blue", "2156", "yellow", "2494", "green", "1757"
    ),
    item = c(
      "A", "A", "B", "B", "B", "B", "A", "A",
      "A", "A", "B", "B", "B", "B", "A", "A",
      "C", "C", "C", "C", "D", "D", "D", "D",
      "C", "C", "C", "C", "D", "D", "D", "D"
    )
  ),
  .Names = c("subject", "group", "measurement", "item_pos", "value", "item"),
  row.names = c(NA, -32L),
  class = "data.frame"
)

每个受试者（subject）在每个项目（item）上都有多次观测，因此受试者 1 的数据如下所示：

> filter(df.tidy, subject==1)
  subject  group measurement item_pos  value item
1       1 group1       color        1   blue    A
2       1 group1        time        1   1508    A
3       1 group1       color        2 orange    B
4       1 group1        time        2    752    B
5       1 group1       color        3  black    B
6       1 group1        time        3    585    B
7       1 group1       color        4    red    A
8       1 group1        time        4    842    A

因此，在每个 group 内，每个 item 会出现两次，并且每次出现都会记录颜色（color）和时间（time）两项 measurement。项目的呈现顺序记录在 item_pos 中。

虽然我喜欢这种长格式，但一位合作者需要稍宽一些的格式：按项目把颜色和时间的重复测量分别放到各自的列中。所需格式如下：

  subject  group item color1 color2 time1 time2
        1 group1    A   blue    red  1508   842
        1 group1    B orange  black   752   585
...
        4 group2    D yellow  green  2494  1757

我的感觉是，应该可以结合使用 gather()、spread() 和其他 dplyr 动词来实现，但我不确定在 dplyr 中该怎么写：用 for 循环的说法，就是按组遍历各个项目，并把后续出现的颜色和时间观测值收集到后面的列中。非常感谢！

我咨询了相关问题：

Answer 1

我们可以尝试使用 library(data.table) 中的 dcast。先将 'data.frame' 转换为 'data.table'（setDT(df.tidy)），按 'subject'、'measurement' 和 'item' 分组，创建序列列 "N"，然后使用 dcast 将"长"格式转换为"宽"格式。

library(data.table)
# setDT() converts df.tidy to a data.table by reference (no copy is made).
# N numbers the repeated observations of each item within a subject and
# measurement; seq_len(.N) is the safe spelling of 1:.N.
setDT(df.tidy)[, N := seq_len(.N), by = .(subject, measurement, item)]
# Cast long -> wide: one row per subject/group/item and one column per
# measurement/N pair; sep = "" yields names such as color1 and time2.
dcast(df.tidy, subject + group + item ~ measurement + N,
      value.var = "value", sep = "")
#   subject  group item color1 color2 time1 time2
#1:       1 group1    A   blue    red  1508   842
#2:       1 group1    B orange  black   752   585
#3:       2 group1    A    red orange   879  2241
#4:       2 group1    B  white  green  1455  1757
#5:       3 group2    C  white yellow  2251  1740
#6:       3 group2    D    red yellow  1962  1854
#7:       4 group2    C  green   blue  1859  2156
#8:       4 group2    D yellow  green  2494  1757

或者使用 dplyr/tidyr：按相同的列分组，创建序列列（"N"），ungroup，然后把 'measurement' 和 'N' 两列粘贴在一起生成 'measurementN' 列（使用 unite），最后用 spread 将数据转换为"宽"格式。

library(dplyr)
library(tidyr)
# item_pos is dropped up front: if it were still present when spreading, the
# two occurrences of an item would stay on separate rows.
df.tidy %>%
  select(-item_pos) %>%
  group_by(subject, measurement, item) %>%
  # N = occurrence number of the item within subject and measurement
  mutate(N = seq_len(n())) %>%
  ungroup() %>%
  # fuse measurement and N into one key: color1, color2, time1, time2
  unite(measurementN, measurement, N, sep = "") %>%
  spread(key = measurementN, value = value)
#  subject  group  item color1 color2 time1 time2
#    (int) (fctr) (chr)  (chr)  (chr) (chr) (chr)
#1       1 group1     A   blue    red  1508   842
#2       1 group1     B orange  black   752   585
#3       2 group1     A    red orange   879  2241
#4       2 group1     B  white  green  1455  1757
#5       3 group2     C  white yellow  2251  1740
#6       3 group2     D    red yellow  1962  1854
#7       4 group2     C  green   blue  1859  2156
#8       4 group2     D yellow  green  2494  1757

按重复测量对整洁数据进行部分展开（spread）

1 个答案: