Question

我刚开始学习tidyr，在使用spread()时遇到了问题。

这是一个虚构的实验数据集：

library(tidyr)
df <- structure(list(mood = c(0.855, -0.103, 0.421, -0.222, 0.772, -0.027, -1.088, 0.923, -1.516, -1.503, -0.358, -0.357, -0.344, 0.294, 0.348, -0.174, 0.872, -1.188, 0.842, -0.246, -0.758, 0.674, 0.045, 0.72, -1.253, 0.00599999999999995, -0.0749999999999999,1.623, -1.754, -0.44, -0.607, -0.083, -0.827, -0.337, -0.6, 0.429, -0.383, -1.755, 0.894, 0.146, -0.658, -0.409, -0.531, 1.388, -0.688, 0.521, -0.662, 0.852, -1.363, 0.18, -0.775, 0.393, -0.926, 0.809, -0.857, 0.889, 0.0969999999999999, -1.553, -0.21,1.769, -0.114, -0.203, 0.805, 0.186, 0.286, -0.076, 0.137, 1.208, 0.33, 0.34, 0.832, 0.815, -0.427, 0.444, -0.838, 1.45, 1.701, -2.265, 0.531, 0.808),
  subj = structure(c(1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,11L, 13L, 1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L,6L, 7L, 8L, 9L, 10L, 11L, 13L), .Label = c("s1", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s2", "s20", "s3", "s4", "s5", "s6", "s7", "s8", "s9"), class = "factor"),
  depressed = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), 
  activity = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("exercize", "relaxation"), class = "factor"), 
  drug = structure(c(1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("placebo", "SSRI"), class = "factor")), .Names = c("mood", "subj", "depressed", "activity", "drug"), row.names = c(NA, -80L), class = "data.frame")

查看df可以发现，activity和drug是受试者内因素，而depressed是受试者间因素：

head(arrange(df, subj))
    mood subj depressed   activity    drug
1  0.855   s1       yes relaxation placebo
2 -0.758   s1       yes relaxation    SSRI
3 -0.658   s1       yes   exercize placebo
4 -0.114   s1       yes   exercize    SSRI
5 -1.503  s10        no relaxation placebo
6 -0.440  s10        no relaxation    SSRI

我想将df转换为宽格式，其中每个受试者的mood各占一列，每行对应activity和drug的一种组合。不幸的是，我不知道如何避免为depressed的每个水平分别生成一组行。这是我到目前为止所做的：

df %>% spread(subj, mood) %>% `[`(1:5)
  depressed   activity    drug     s1   s10
1        no   exercize placebo     NA  0.18
2        no   exercize    SSRI     NA  0.34
3        no relaxation placebo     NA -1.50
4        no relaxation    SSRI     NA -0.44
5       yes   exercize placebo -0.658    NA
6       yes   exercize    SSRI -0.114    NA
7       yes relaxation placebo  0.855    NA
8       yes relaxation    SSRI -0.758    NA

我希望不按depressed展开，使输出结果只有四行。

Answer 1

感谢@docendodiscimus在评论中提供解决方案。

要在“宽”格式中去掉受试者间因素，只需使用dplyr::select配合-即可。

> library(dplyr)
> df <- tbl_df(df)
> df %>% select(-depressed) %>% spread(subj, mood)
Source: local data frame [4 x 22]

    activity    drug     s1    s10    s11    s12    s13    s14    s15    s16
1   exercize placebo -0.658  0.180 -0.775  0.393 -0.926  0.809 -0.857  0.889
2   exercize    SSRI -0.114  0.340  0.832  0.815 -0.427  0.444 -0.838  1.450
3 relaxation placebo  0.855 -1.503 -0.358 -0.357 -0.344  0.294  0.348 -0.174
4 relaxation    SSRI -0.758 -0.440 -0.607 -0.083 -0.827 -0.337 -0.600  0.429

tidyr：使用spread()时不按所有列展开

1 个答案: