我刚开始学习tidyr
,在使用spread()
时遇到了问题。
下面是一个虚构的实验数据集:
# Attach tidyr, which provides spread() used further below.
library(tidyr)
# Example data set in long format: 80 rows, one mood score per row.
# The literal appears to be dput() output, so it can be pasted into a session as-is.
# mood: numeric outcome, 80 values.
df <- structure(list(mood = c(0.855, -0.103, 0.421, -0.222, 0.772, -0.027, -1.088, 0.923, -1.516, -1.503, -0.358, -0.357, -0.344, 0.294, 0.348, -0.174, 0.872, -1.188, 0.842, -0.246, -0.758, 0.674, 0.045, 0.72, -1.253, 0.00599999999999995, -0.0749999999999999,1.623, -1.754, -0.44, -0.607, -0.083, -0.827, -0.337, -0.6, 0.429, -0.383, -1.755, 0.894, 0.146, -0.658, -0.409, -0.531, 1.388, -0.688, 0.521, -0.662, 0.852, -1.363, 0.18, -0.775, 0.393, -0.926, 0.809, -0.857, 0.889, 0.0969999999999999, -1.553, -0.21,1.769, -0.114, -0.203, 0.805, 0.186, 0.286, -0.076, 0.137, 1.208, 0.33, 0.34, 0.832, 0.815, -0.427, 0.444, -0.838, 1.45, 1.701, -2.265, 0.531, 0.808),
# subj: factor with 20 levels ("s1" ... "s20", stored in alphabetical level order).
subj = structure(c(1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,11L, 13L, 1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 1L, 12L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 2L, 3L, 4L, 5L,6L, 7L, 8L, 9L, 10L, 11L, 13L), .Label = c("s1", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s2", "s20", "s3", "s4", "s5", "s6", "s7", "s8", "s9"), class = "factor"),
# depressed: two-level factor ("no", "yes").
depressed = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"),
# activity: two-level factor; the label "exercize" is spelled this way in the data and in the printed output.
activity = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("exercize", "relaxation"), class = "factor"),
# drug: two-level factor ("placebo", "SSRI"); the trailing arguments name the columns and set 80 automatic row names.
drug = structure(c(1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("placebo", "SSRI"), class = "factor")), .Names = c("mood", "subj", "depressed", "activity", "drug"), row.names = c(NA, -80L), class = "data.frame")
查看df
可以看到,activity
和drug
是受试者内因素,而depressed
是受试者间因素:
# Show the first rows ordered by subject. arrange() is a dplyr function and
# only tidyr has been attached so far, so the call is namespace-qualified.
head(dplyr::arrange(df, subj))
mood subj depressed activity drug
1 0.855 s1 yes relaxation placebo
2 -0.758 s1 yes relaxation SSRI
3 -0.658 s1 yes exercize placebo
4 -0.114 s1 yes exercize SSRI
5 -1.503 s10 no relaxation placebo
6 -0.440 s10 no relaxation SSRI
我想将df
转换为宽格式,使每个受试者的mood
各占一列,每一行对应activity
和drug
的一种组合。不幸的是,我不知道如何避免为depressed
的每个水平各生成一组行。这是我目前的做法:
# Spread mood into one column per subject, then keep only the first five
# columns of the result for display.
df %>%
  spread(key = subj, value = mood) %>%
  .[1:5]
depressed activity drug s1 s10
1 no exercize placebo NA 0.18
2 no exercize SSRI NA 0.34
3 no relaxation placebo NA -1.50
4 no relaxation SSRI NA -0.44
5 yes exercize placebo -0.658 NA
6 yes exercize SSRI -0.114 NA
7 yes relaxation placebo 0.855 NA
8 yes relaxation SSRI -0.758 NA
我不希望结果按depressed
展开,这样输出就只有四行。
答案 0 :(得分:1)
感谢@dendndodiscimus在评论中提供解决方案。
要在“宽”格式中去掉受试者间因素,只需使用dplyr::select
和-
即可。
> library(dplyr)
> # as_tibble() replaces tbl_df(), which dplyr has deprecated.
> df <- as_tibble(df)
> # Drop the between-subjects column before spreading; otherwise spread()
> # keeps a separate set of rows for each level of depressed.
> df %>% select(-depressed) %>% spread(subj, mood)
Source: local data frame [4 x 22]
activity drug s1 s10 s11 s12 s13 s14 s15 s16
1 exercize placebo -0.658 0.180 -0.775 0.393 -0.926 0.809 -0.857 0.889
2 exercize SSRI -0.114 0.340 0.832 0.815 -0.427 0.444 -0.838 1.450
3 relaxation placebo 0.855 -1.503 -0.358 -0.357 -0.344 0.294 0.348 -0.174
4 relaxation SSRI -0.758 -0.440 -0.607 -0.083 -0.827 -0.337 -0.600 0.429