如何清理调查数据?

时间:2015-09-28 12:48:54

标签: r reshape tidyr

我应该如何通过R将我的数据集整理成下面的形式:

输入

enter image description here

预期产出

enter image description here

使用tidyr包

我正在考虑使用tidyr,但不知道该如何下手。有什么建议吗?

数据

输入
# Example survey data in wide form (looks like dput() output): 12 rows, an
# `ID` column plus one factor column per (proposal, rank) combination, named
# "Proposal.<n>...<rank>" with <n> in 1..4 and <rank> in first/second/last.
# Each rank column has two levels: "" (level 1) and the rank label itself
# (level 2), so a code of 2L marks the cell as filled in.
# The ID levels are in alphabetical order ("obs 1", "obs 10", ...), which is
# why the integer codes are not sequential: the rows run from obs 1 to obs 12.
input <- structure(list(ID = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L), 
                               .Label = c("obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3", 
                                          "obs 4", "obs 5", "obs 6", "obs 7", "obs 8", "obs 9"), 
                               class = "factor"), 
                Proposal.1...first = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.1...second = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.1...last = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                              .Label = c("", "last"), class = "factor"), 
                Proposal.2...first = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.2...second = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.2...last = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L), 
                                              .Label = c("", "last"), class = "factor"), 
                Proposal.3...first = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.3...second = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.3...last = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                              .Label = c("", "last"), class = "factor"), 
                Proposal.4...first = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), 
                                               .Label = c("", "first"), class = "factor"), 
                Proposal.4...second = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                                                .Label = c("", "second"), class = "factor"), 
                Proposal.4...last = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), 
                                              .Label = c("", "last"), class = "factor")), 
           .Names = c("ID", "Proposal.1...first", "Proposal.1...second", "Proposal.1...last", "Proposal.2...first", 
                      "Proposal.2...second", "Proposal.2...last", "Proposal.3...first","Proposal.3...second", 
                      "Proposal.3...last", "Proposal.4...first", "Proposal.4...second", "Proposal.4...last"), 
           class = "data.frame", 
           row.names = c(NA, -12L))
预期产出
# Expected result of the reshape: 12 rows, with the same `ID` factor as the
# input plus one factor column per rank (`first`, `second`, `last`). Each rank
# column shares the levels "Proposal 1" .. "Proposal 4", and the integer code
# in a cell selects which proposal holds that rank for the row.
output <- structure(list(ID = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 2L, 3L, 4L), 
                                    .Label = c("obs 1", "obs 10", "obs 11", "obs 12", "obs 2", "obs 3", "obs 4", "obs 5", 
                                               "obs 6", "obs 7", "obs 8", "obs 9"), class = "factor"), 
                     first = structure(c(1L, 1L, 2L, 4L, 2L, 3L, 2L, 4L, 1L, 1L, 4L, 2L), 
                                       .Label = c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4"), class = "factor"), 
                     second = structure(c(2L, 4L, 3L, 3L, 4L, 1L, 3L, 1L, 3L, 3L, 3L, 3L), 
                                        .Label = c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4"), class = "factor"), 
                     last = structure(c(3L, 3L, 1L, 1L, 1L, 2L, 4L, 2L, 4L, 4L, 2L, 4L), 
                                      .Label = c("Proposal 1", "Proposal 2", "Proposal 3", "Proposal 4"), class = "factor")), 
                .Names = c("ID", "first", "second", "last"), class = "data.frame", row.names = c(NA, -12L))

谢谢!

1 个答案:

答案 0 :(得分:7)

使用dplyr和tidyr,你可以组合使用gather和spread:

library(dplyr)
library(tidyr)

# Reshape `input` so each ID gets one row with a column per rank
# (first / second / last) holding the proposal that received that rank.
input %>%
  # Long form: one row per (ID, original column) pair.
  gather(proposal, value, -ID) %>%
  # Strip the "...first" / "...second" / "...last" suffix from the column
  # name, then replace the remaining "." with a space ("Proposal.1" becomes
  # "Proposal 1"). The second substitution is cosmetic only.
  mutate(proposal = sub("\\.", " ", sub("\\.{3}.*", "", proposal))) %>%
  # Drop the rows whose value is the empty string.
  filter(value != "") %>%
  # Wide form: the distinct values become columns filled with the proposal.
  spread(value, proposal) %>%
  # Cosmetic: put the columns in rank order.
  select(ID, first, second, last) %>%
  # Cosmetic: sort rows by the numeric part of the ID ("obs 10" after "obs 9").
  arrange(as.numeric(sub("obs ", "", ID)))

输出

       ID      first     second       last
1   obs 1 Proposal 1 Proposal 2 Proposal 3
2   obs 2 Proposal 1 Proposal 4 Proposal 3
3   obs 3 Proposal 2 Proposal 3 Proposal 1
4   obs 4 Proposal 4 Proposal 3 Proposal 1
5   obs 5 Proposal 2 Proposal 4 Proposal 1
6   obs 6 Proposal 3 Proposal 1 Proposal 2
7   obs 7 Proposal 2 Proposal 3 Proposal 4
8   obs 8 Proposal 4 Proposal 1 Proposal 2
9   obs 9 Proposal 1 Proposal 3 Proposal 4
10 obs 10 Proposal 1 Proposal 3 Proposal 4
11 obs 11 Proposal 4 Proposal 3 Proposal 2
12 obs 12 Proposal 2 Proposal 3 Proposal 4