Question

我需要将一个宽数据集转换为长数据集，并且有16列必须收敛到4列。每4列包含彼此相关的信息，并且该信息在转换中一定不能“丢失”。

我有来自四个区块的排名任务的数据，该任务实质上给了我一个数据集，其中信息以较宽的格式分为四个组。即first_image，first_sex，first_score，second_image，second_sex，second_score ...

我尝试了group_by和gather()的各种组合，但离想要的结果还差得很远。

我已经读过Reshaping multiple sets of measurement columns (wide format) into single columns (long format)这个问题，但恐怕我还是没有弄明白。

我已经制作了一份模拟某位参与者数据的示例数据，并且还给出了我希望得到的数据形式的示例。


library(tidyverse)

# Wide-format example: one participant, four rows. Each of the four picks
# (first .. fourth) contributes a block of four columns: the rank itself,
# the image that was picked, and the neuro / sex labels for that pick.
sample_dat <- data.frame(
  subject_id        = rep("sj1", times = 4),
  # first pick
  first_pick        = rep(1, times = 4),
  first_image_pick  = letters[1:4],
  first_pick_neuro  = rep("TD", times = 4),
  first_pick_sex    = rep("F", times = 4),
  # second pick
  second_pick       = rep(2, times = 4),
  second_image_pick = letters[5:8],
  second_pick_neuro = rep("TD", times = 4),
  second_pick_sex   = rep("M", times = 4),
  # third pick
  third_pick        = rep(3, times = 4),
  third_image_pick  = letters[9:12],
  third_pick_neuro  = rep("DS", times = 4),
  third_pick_sex    = rep("F", times = 4),
  # fourth pick
  fourth_pick       = rep(4, times = 4),
  fourth_image_pick = letters[13:16],
  fourth_pick_neuro = rep("DS", times = 4),
  fourth_pick_sex   = rep("M", times = 4)
)

预期输出：


# Desired long format: one row per (trial, pick) combination, 16 rows in all.
final_data <- data.frame(
  subject_id = rep("sj1", times = 16),
  image      = letters[1:16],
  # rank comes from the numbers in the first_pick, second_pick etc. columns
  rank       = rep(as.numeric(1:4), each = 4),
  neuro      = rep(c("TD", "DS"), each = 8),
  sex        = rep(rep(c("F", "M"), times = 2), each = 4)
)

到目前为止，我已经尝试过了，但是它只复制了所有信息：


# Attempt so far: stack only the four *_image_pick columns into Pick / Image.
# The rank, neuro and sex columns are not part of the gather, so they stay
# wide and are simply repeated alongside every stacked image row.
sample_dat_long <- gather(
  group_by(sample_dat, subject_id),
  key = "Pick",
  value = "Image",
  first_image_pick, second_image_pick, third_image_pick, fourth_image_pick
)

因此，从本质上讲，我不想在收集数据时丢失与每个图像对应的信息（pick、sex、neuro）。

任何帮助都会很棒！

Answer 1

我们可以使用data.table中的melt来完成此操作，它可以同时接受多个measure patterns，将数据从“宽”格式重塑为“长”格式。在这里，列名中带有子字符串'image'，'neuro'，'sex'的列分别被重塑为单独的列，以获得预期的输出

library(data.table)
# Melt the four column families (rank, image, neuro, sex) in one call.
# Each regex selects one family; within a family the columns are taken in
# their original order, so the first/second/third/fourth picks line up
# across families.
#  - "^[a-z]+_pick$" matches first_pick .. fourth_pick but not
#    first_image_pick, so `rank` holds the values actually stored in the
#    data instead of melt's positional index (a factor).
#  - as.data.table() works on a copy; setDT() would convert the caller's
#    `sample_dat` to a data.table by reference.
melt(
  as.data.table(sample_dat),
  measure.vars = patterns("^[a-z]+_pick$", "image", "neuro", "sex"),
  value.name = c("rank", "image", "neuro", "sex"),
  variable.name = "pick_order"
)[, .(subject_id, rank, image, neuro, sex)]

Answer 2

我想您可以逐列进行操作，因为最后只需要4列。获取应该放在第一个列中的列的索引（如果我理解正确的话）：

  # Positions of every fourth column, starting at column 2
  # (first_pick, second_pick, ... in the sample data).
  ind1 <- seq(from = 2, to = ncol(sample_dat), by = 4)
  # Stack those columns; gather() returns key/value columns and `[2]` keeps
  # only the value column (still as a one-column data frame).
  column1 <- gather(sample_dat[, ind1])[2]

然后对所有其他3列重复：

  # Same idea, shifted by one: every fourth column starting at column 3
  # (the *_image_pick columns in the sample data).
  ind2 <- seq(from = 3, to = ncol(sample_dat), by = 4)
  # Keep only the stacked values, as a one-column data frame.
  column2 <- gather(sample_dat[, ind2])[2]

您甚至可以使用for循环来完成这4列，而不必“手动”执行。然后将它们组合回数据框

Answer 3

值得考虑使用合适的列名（即"<variable_chr>.<time_num>"）。但是我们可以在一秒钟内解决它。

# Ordinal prefixes used in the column names; the position in this vector is
# the numeric "time" suffix that reshape() will split on.
pfx <- c("first", "second", "third", "fourth")

# Rename every column except subject_id to "<variable>.<time>":
#   first_pick        -> rank.1
#   first_image_pick  -> image.1
#   first_pick_neuro  -> neuro.1
# vapply() guarantees exactly one string per column, and an unrecognised
# name raises an error instead of silently producing a malformed name.
names(sample_dat)[-1] <- vapply(
  names(sample_dat)[-1],
  function(old_name) {
    # Drop the "_pick" filler, then split into c(<ordinal>[, <variable>]).
    parts <- strsplit(gsub("_pick", "", old_name), "_")[[1]]
    time <- match(parts[1], pfx)
    if (is.na(time) || length(parts) > 2) {
      stop("unexpected column name: ", old_name, call. = FALSE)
    }
    # A bare "<ordinal>_pick" column holds the rank itself.
    variable <- if (length(parts) == 2) parts[2] else "rank"
    paste(variable, time, sep = ".")
  },
  character(1)
)

names(sample_dat)  # good names now
# [1] "subject_id" "rank.1"     "image.1"    "neuro.1"    "sex.1"      "rank.2"    
# [7] "image.2"    "neuro.2"    "sex.2"      "rank.3"     "image.3"    "neuro.3"   
# [13] "sex.3"      "rank.4"     "image.4"    "neuro.4"    "sex.4"

此后，我们可以轻松使用reshape。

# Stack the "<variable>.<time>" columns into long format; reshape() splits
# each name at the "." to recover the variable name and the time index.
#  - varying: every column except subject_id, selected by name rather than
#    by hard-coded positions.
#  - new.row.names: the long result has one row per input row per pick, so
#    it needs nrow(sample_dat) * length(pfx) row names. (The number of
#    columns minus one only happens to give the same count, 16, here.)
reshape(sample_dat, idvar = "subject_id", varying = names(sample_dat)[-1],
        direction = "long",
        new.row.names = seq_len(nrow(sample_dat) * length(pfx)))
#    subject_id time rank image neuro sex
# 1         sj1    1    1     a    TD   F
# 2         sj1    1    1     b    TD   F
# 3         sj1    1    1     c    TD   F
# 4         sj1    1    1     d    TD   F
# 5         sj1    2    2     e    TD   M
# 6         sj1    2    2     f    TD   M
# 7         sj1    2    2     g    TD   M
# 8         sj1    2    2     h    TD   M
# 9         sj1    3    3     i    DS   F
# 10        sj1    3    3     j    DS   F
# 11        sj1    3    3     k    DS   F
# 12        sj1    3    3     l    DS   F
# 13        sj1    4    4     m    DS   M
# 14        sj1    4    4     n    DS   M
# 15        sj1    4    4     o    DS   M
# 16        sj1    4    4     p    DS   M

数据

# Same sample data as in the question, with every text column stored as a
# factor (stringsAsFactors = TRUE) so that it matches the dput() form of the
# data regardless of the R version's default.
sample_dat <- data.frame(
  subject_id        = rep("sj1", times = 4),
  first_pick        = rep(1, times = 4),
  first_image_pick  = c("a", "b", "c", "d"),
  first_pick_neuro  = rep("TD", times = 4),
  first_pick_sex    = rep("F", times = 4),
  second_pick       = rep(2, times = 4),
  second_image_pick = c("e", "f", "g", "h"),
  second_pick_neuro = rep("TD", times = 4),
  second_pick_sex   = rep("M", times = 4),
  third_pick        = rep(3, times = 4),
  third_image_pick  = c("i", "j", "k", "l"),
  third_pick_neuro  = rep("DS", times = 4),
  third_pick_sex    = rep("F", times = 4),
  fourth_pick       = rep(4, times = 4),
  fourth_image_pick = c("m", "n", "o", "p"),
  fourth_pick_neuro = rep("DS", times = 4),
  fourth_pick_sex   = rep("M", times = 4),
  stringsAsFactors  = TRUE
)

如何在R中的多个步骤中收集列而不丢失分组

3 个答案:

数据