整理数据集

时间:2019-11-30 15:55:18

标签: r tidyverse

我一直在努力整理一个数据集。简要说明:每行代表一个人,随后针对每个有分数的日期,重复出现“Date”、“ScoreA”、“ScoreB”等列。我希望把所有日期放在同一列中,并把相应的分数放在相邻的列中。我尝试了pivot_longer和pivot_wider的各种简单组合,但到目前为止都没有成功。示例文件位于

https://github.com/tueland/R_help.git

感谢您的帮助!

1 个答案:

答案 0 :(得分:0)

我们可以先修正列名(为重复的列名加上序号后缀),然后再使用pivot_longer

library(tidyr)
library(dplyr)

# Base name of every column except the first: strip an optional "." followed by
# trailing digits (a name like "Date.1" becomes "Date"), so that repeated
# measurements share one base name.
base_names <- sub("\\.?\\d+$", "", names(df1)[-1])
# Running count of each base name in column order, used as the group suffix
# ("Date_1", "Date_2", ...).
occurrence <- ave(base_names, base_names, FUN = seq_along)
names(df1)[-1] <- paste0(base_names, "_", occurrence)

# Drop the two "X" columns, then split each remaining name at "_": the first
# piece names the output column (".value"), the second is stored in `group`.
without_x <- select(df1, -c("X_1", "X_2"))
long_scores <- pivot_longer(
  without_x,
  cols = -ID,
  names_to = c(".value", "group"),
  names_sep = "_",
  values_drop_na = TRUE
)
# Discard any row that still contains an NA in some column.
na.omit(long_scores)
# A tibble: 5 x 6
#     ID group Date   ScoreA ScoreB ScoreC
#  <int> <chr> <fct>   <dbl>  <dbl>  <dbl>
#1   123 1     1/1/11    1      2      3  
#2   123 2     1/2/11    4      5      6  
#3   123 3     1/4/11    6.1    6.2    6.3
#4   345 1     2/2/22    7      8      9  
#5   345 2     2/3/22   10     11     12  

或者使用map

library(purrr)
library(stringr)
# For each group number 1..3, keep ID plus the columns whose names end in that
# number, strip the "_<n>" suffix so every group shares the same column names,
# and let map_dfr() row-bind the pieces. This relies on df1's columns already
# carrying the "_<n>" suffixes.
map_dfr(1:3, function(grp) {
  grp_cols <- select(df1, ID, ends_with(as.character(grp)))
  rename_all(grp_cols, function(nm) str_remove(nm, "_\\d+$"))
}) %>%
  # Remove the X column, then drop rows containing any NA.
  select(-X) %>%
  na.omit()
#   ID   Date ScoreA ScoreB ScoreC
#1  123 1/1/11    1.0    2.0    3.0
#2  345 2/2/22    7.0    8.0    9.0
#6  123 1/2/11    4.0    5.0    6.0
#7  345 2/3/22   10.0   11.0   12.0
#11 123 1/4/11    6.1    6.2    6.3

或者使用data.table中的melt

library(data.table)
# Reshape with data.table's melt(): each regex passed to patterns() collects one
# family of repeated columns, and value.name supplies the matching output column
# names in the same order. `measure.vars` is spelled out in full rather than
# relying on partial argument matching of `measure`.
# NOTE: setDT() converts df1 to a data.table by reference, so df1 itself is
# modified by this call.
na.omit(
  melt(
    setDT(df1),
    measure.vars = patterns("^Date", "ScoreA", "ScoreB", "ScoreC", "^X"),
    value.name = c("Date", "ScoreA", "ScoreB", "ScoreC", "X")
  )[, X := NULL]  # remove the melted X column before dropping rows with NA
)
#    ID variable   Date ScoreA ScoreB ScoreC
#1: 123        1 1/1/11    1.0    2.0    3.0
#2: 345        1 2/2/22    7.0    8.0    9.0
#3: 123        2 1/2/11    4.0    5.0    6.0
#4: 345        2 2/3/22   10.0   11.0   12.0
#5: 123        3 1/4/11    6.1    6.2    6.3

数据

# Read the sample wide-format CSV straight from GitHub into df1.
df1 <- read.csv(
  file = "https://raw.githubusercontent.com/tueland/R_help/master/R%20help.csv"
)