Question

我正在处理 Excel 电子表格中的一些活动前和活动后的调查数据，其中有一个问题可以有多个答案（学生必须选出所有正确答案才能获得该题的全部分数）。但按照数据最初的格式，每个单独的回答各占一行，其他元数据（数据库信息、学生用户名、性别等）只是在每一行中重复——下面我用一个示例详细说明。

>data
Database   Username   Gender   InterviewType   Question                    Answer
1          steve      Male     Pre             Which of the following...   "Response 1"
1          steve      Male     Pre             Which of the following...   "Response 2"
1          steve      Male     Pre             Which of the following...   "Response 3"
1          steve      Male     Pre             Please indicate your race.  "White"
1          steve      Male     Pre             Explain how you would...    "Response"
2          jenna      Female   Pre             Which of the following...   "Response 1"
2          jenna      Female   Pre             Which of the following...   "Response 2"
2          jenna      Female   Pre             Please indicate your race.  "White"
2          jenna      Female   Pre             Explain how you would...    "Response"
1          jack       Male     Pre             Which of the following...   "Response 1"
1          jack       Male     Pre             Which of the following...   "Response 2"
1          jack       Male     Pre             Which of the following...   "Response 3"
1          jack       Male     Pre             Please indicate your race.  "Black"
1          jack       Male     Pre             Explain how you would...    "Response"
3          billy      Male     Pre             Which of the following...   "Response 1"
3          billy      Male     Pre             Which of the following...   "Response 2"
3          billy      Male     Pre             Please indicate your race.  "Other"
3          billy      Male     Pre             Explain how you would...    "Response"

如果把数据整理成每个问题在数据框中各占一列的格式，对我来说会更有用。因此，我们合作编写了以下代码，把每个不同的问题转换成单独的一列：

# Reshape the long table `data` (one row per answer) into `out`: one row per
# unique combination of the four metadata columns and one column per unique
# question text.
# NOTE(review): the sample table in the question shows the headers
# Database/Username/Gender/InterviewType, while this code reads Db/username/
# gender/interviewForm -- presumably the real column names; confirm.
x = data
require(stringr)
# Drop rows whose Db value is the literal string "Db" (presumably repeated
# header rows -- confirm against the spreadsheet export).
# NOTE(review): if no such row exists, `temp` is integer(0) and `x[-temp,]`
# selects zero rows instead of keeping all of them.
temp = which(x$Db == "Db")
x = x[-temp,]
questions = unique(x$Question)
# Build one " & "-separated key per input row from the four metadata columns.
concdMeta = apply(x[,c('Db', 'username', 'gender', 'interviewForm')],
                  MARGIN = 1,
                  FUN = paste,
                  collapse = " & ")
students = unique(concdMeta)

# NA-filled result skeleton: one row per key, 4 metadata columns plus one
# column per question. The keys are stored as row names.
out = matrix(nrow = length(students),
         ncol = 4 + length(questions))
row.names(out) = students
colnames(out) = c(colnames(x)[1:4], questions)
# NOTE(review): data.frame() with its default check.names may rewrite the
# question-text column names into syntactic names.
out = data.frame(out)

# Split each row-name key back into its four parts to fill the metadata columns.
for(i in 1:nrow(out)) {
  Z = str_split(row.names(out)[i], ' & ')[[1]]
  for(j in 1:4) {
    out[i, j] = Z[j]
  }
}


# Copy every answer into the cell addressed by (metadata key, question).
for (i in 1:nrow(x))
{
  # db, un, g and iF are assigned here but not used again in this snippet.
  db = x$Db[i]
  un = x$username[i]
  g = x$gender[i]
  iF = x$interviewForm[i]
  q = x$Question[i]
  a = x$Answer[i]

  # Rebuild the key from columns 1:4 (assumes these are the same four metadata
  # columns used for concdMeta above -- confirm) and find the matching row.
  this.meta = paste(x[i, 1:4], collapse = " & ")
  matching.row = row.names(out) == this.meta

  # Plain assignment: each write replaces whatever the cell already held, so
  # when several input rows share the same key and question only the last
  # answer remains.
  out[matching.row, 4 + which(questions == q)] = a

}

# Cells that never received an answer become the string "NA".
out[is.na(out)] = "NA"

上面的代码运行良好，只是我们遇到了一个问题：对于有多个回答的问题，它只保留该学生的最后一个回答。例如，对于上面的 steve，它只会列出“Response 3”；对于 jenna，只会列出“Response 2”，依此类推。结果如下：

>data.reformatted
Database   Username   Gender   InterviewType   Which of the following...  Please indicate...  Explain how...
1          steve      Male     Pre             "Response 3"               "White"             "Response"
2          jenna      Female   Pre             "Response 2"               "White"             "Response"              
1          jack       Male     Pre             "Response 3"               "Black"             "Response"              
3          billy      Male     Pre             "Response 2"               "Other"             "Response"

我们能否在上面的代码基础上做些补充，使其把所有回答合并到同一个单元格中？也就是说，最终的数据集应如下所示：

Database   Username   Gender   InterviewType   Which of the following...             Please indicate...   Explain how...
1          steve      Male     Pre             "Response 1, Response 2, Response 3"  "White"             "Response"
2          jenna      Female   Pre             "Response 1, Response 2"              "White"             "Response"
1          jack       Male     Pre             "Response 1, Response 2, Response 3"  "Black"             "Response"
3          billy      Male     Pre             "Response 1, Response 2"              "Other"             "Response"

Answer 1

作为对另一个答案的补充，下面是一个更自动化的版本：

library(dplyr)
library(tidyr)


# Reshape the long survey table `data` (one row per answer) into `dummy2`:
# one row per student/interview and one column per question. When a student
# gave several answers to the same question they are joined into a single
# cell, separated by ", ".
#
# Expects `data` to have the columns Database, Username, Gender,
# InterviewType, Question and Answer. `data` itself is left untouched.

# Columns that together identify one student/interview
metaCols <- c("Database", "Username", "Gender", "InterviewType")

# Work on an all-character copy so factor columns cannot leak integer codes
# or restrict which values a cell may hold.
data.long <- data.frame(lapply(data, as.character), stringsAsFactors = FALSE)

# Unique questions in order of first appearance; used below to restore the
# column order.
questions <- unique(data.long$Question)

# Step 1: collapse all answers to the same question into one string per
# student. Doing this *before* reshaping makes every (student, question)
# pair unique, so spread() needs no helper row-number column, no generated
# code, and no empty filler cells end up pasted into the result.
data.long <- data.long %>%
  dplyr::group_by(Database, Username, Gender, InterviewType, Question) %>%
  dplyr::summarize(
    # Missing answers are skipped rather than pasted in as the text "NA"
    Answer = paste(Answer[!is.na(Answer)], collapse = ", ")
  ) %>%
  dplyr::ungroup()  # summarize() only peels off the innermost grouping level

# Step 2: one column per question. A question a student never answered
# becomes an empty string instead of NA.
data.wide <- data.long %>%
  spread(Question, Answer, fill = "")

# Step 3: spread() sorts the new columns; put them back in the order in which
# the questions first appeared. The columns keep the full question text as
# their names, so no renaming pass is needed.
dummy2 <- data.wide[, c(metaCols, questions)]

Answer 2

我整理了一段代码，应该能实现你想要的效果。运行这段代码需要安装 dplyr 和 tidyr（spread 函数来自 tidyr）。

library(dplyr)  # provides %>%, group_by() and summarize()
library(tidyr)  # provides spread()

# Give every input row a unique id. Without it spread() complains about
# duplicate identifiers, because one student can have several answers to the
# same question. seq_len() also copes with an empty table, where 1:nrow(data)
# would yield c(1, 0).
data$rownum <- seq_len(nrow(data))

# One column per question. Every input row stays a separate row, so only one
# of the question columns is filled in each row.
data.wide <- data %>%
    spread(Question, Answer)

# Short names for the question columns. Positions 6:8 assume the four
# metadata columns and rownum come first, followed by the three question
# columns in the order Explain..., Please indicate..., Which...
colnames(data.wide)[6:8] <- c("Explain", "Indicate", "Which") # change column names
data.wide[is.na(data.wide)] <- ""  # blank out the cells spread() left as NA
head(data.wide)
  Database Username Gender InterviewType rownum    Explain Indicate        Which
1        1     jack   Male           Pre     10                     "Response 1"
2        1     jack   Male           Pre     11                     "Response 2"
3        1     jack   Male           Pre     12                     "Response 3"
4        1     jack   Male           Pre     13             "Black"             
5        1     jack   Male           Pre     14 "Response"                      
6        1    steve   Male           Pre      1                     "Response 1"


# Group by necessary columns and merge rows using Reduce and paste    
result <- data.wide %>%
  dplyr::group_by(Database, Username, Gender, InterviewType) %>%
  dplyr::summarize(
    Which = Reduce(function(...) paste(...), Which),
    Indicate = Reduce(function(...) paste(...), Indicate),
    Explain = Reduce(function(...) paste(...), Explain))

head(as.data.frame(result))
  Database Username Gender InterviewType                                    Which    Indicate        Explain
1        1     jack   Male           Pre "Response 1" "Response 2" "Response 3"      "Black"      "Response"
2        1    steve   Male           Pre "Response 1" "Response 2" "Response 3"      "White"      "Response"
3        2    jenna Female           Pre              "Response 1" "Response 2"      "White"      "Response"
4        3    billy   Male           Pre              "Response 1" "Response 2"      "Other"      "Response"

希望这会有所帮助。

有没有办法将多个数据框条目连接成一个条目？

2 个答案: