在 R 中将长格式数据集转换为宽格式(使用 tidyr)?

时间:2018-08-08 14:51:32

标签: r dataframe dplyr tidyr

这是我的数据框示例:

# dput() output reproducing the example data: a 16-row tibble with columns
# user_id, press_id, and 14 numeric measurement columns. Each (user_id,
# press_id) pair appears twice — i.e. two replicate records per user.
structure(list(user_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L), press_id = c(1L, 2L, 3L, 4L, 1L, 
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), press_acc_mag = c(23.6537093679718, 
23.0846851796042, 23.1615193166382, 23.1611737715988, 23.1693053238739, 
22.6002811355062, 22.6771152725402, 22.6767697275009, 23.2636642131927, 
22.6946400248251, 22.7714741618591, 22.7711286168198, 23.9096268064615, 
23.3406026180939, 23.4174367551279, 23.4170912100885), release_acc_mag = c(22.444559535822, 
22.3974678281557, 22.5370781850474, 22.3567189737439, 22.4517313344457, 
22.4046396267793, 22.544249983671, 22.3638907723676, 22.4494140705383, 
22.4023223628719, 22.5419327197636, 22.3615735084602, 22.4099252863741, 
22.3628335787077, 22.5024439355994, 22.322084724296), max_acc = c(23.7327911876129, 
23.8772090302736, 23.4628943410094, 23.4333331415454, 23.8558416360749, 
24.0002594787356, 23.5859447894714, 23.5563835900074, 23.5419580147544, 
23.686375857415, 23.2720611681509, 23.2424999686869, 24.0218455738806, 
24.1662634165412, 23.7519487272771, 23.7223875278131), min_acc = c(22.2863243700941, 
22.2026521966429, 22.1872267275715, 22.2835176932173, 22.2213018995416, 
22.1376297260904, 22.122204257019, 22.2184952226648, 22.247136081127, 
22.1634639076758, 22.1480384386044, 22.2443294042502, 22.2763940267469, 
22.1927218532957, 22.1772963842243, 22.2735873498701), avg_acc = c(22.9971077741663, 
22.8805170955298, 22.8162218339998, 22.8998658131877, 22.8198237605325, 
22.7032330818959, 22.6389378203659, 22.7225817995538, 22.9432723377064, 
22.8266816590698, 22.7623863975398, 22.8460303767277, 22.6997818029301, 
22.5831911242936, 22.5188958627636, 22.6025398419515), press_vel_ang_mag = c(2.99159368861775, 
3.69840841890355, 3.32010086461129, 3.59665109519773, 2.63983403890421, 
3.3346628319386, 2.95635527764634, 3.19667346535676, 3.18228873692629, 
3.88910346721209, 3.51079591291983, 3.78734614350627, 3.25113160166759, 
3.94596039470198, 3.56765284040972, 3.80797102812014), release_vel_ang_mag = c(3.92468401207319, 
4.03008319373796, 4.1763926713929, 3.97486508224465, 4.058922586184, 
4.16432176784877, 4.31063124550371, 4.13278517230109, 3.99069606133371, 
4.09609524299848, 4.24240472065342, 4.04087713150517, 4.02678472891242, 
4.13218391057718, 4.27849338823212, 4.1006473150295), max_vel_ang = c(4.1156188310448, 
4.35656997515235, 4.36732749036451, 4.27129525975646, 3.80582301429526, 
4.04677415840281, 4.05753167361497, 3.96149944300692, 3.65238824784895, 
3.8933393919565, 3.90409690716865, 3.8080646765606, 3.81641565599981, 
4.05736680010736, 4.06812431531952, 3.97209208471146), min_vel_ang = c(3.16836561463486, 
3.30917959689493, 3.4968727906284, 3.4259687986302, 2.94495943811947, 
3.08577342037954, 3.27346661411301, 3.20256262211481, 2.79156870664285, 
2.93238268890291, 3.12007588263639, 3.04917189063818, 3.17862422648419, 
3.31943820874426, 3.50713140247773, 3.43622741047952), avg_vel_ang = c(3.49530401281139, 
3.62360655957713, 3.71126066224395, 3.66119576608271, 3.31029178731189, 
3.43859433407763, 3.52624843674446, 3.47618354058321, 3.36223808478038, 
3.49054063154613, 3.57819473421295, 3.52812983805171, 3.47104738292443, 
3.59934992969018, 3.687004032357, 3.63693913619575), press_size = c(2.08045207961665, 
2.09221678961665, 2.08045207961665, 2.08437364761665, 2.07665439720744, 
2.08841910720744, 2.07665439720744, 2.08057596520744, 2.07421744576499, 
2.08598215576499, 2.07421744576499, 2.07813901376499, 2.07581316516794, 
2.08757787516794, 2.07581316516794, 2.07973473316794), release_size = c(2.08792744972158, 
2.08792744972158, 2.08400587572158, 2.08792744972158, 2.0914990670944, 
2.0914990670944, 2.0875774930944, 2.0914990670944, 2.09029239656691, 
2.09029239656691, 2.08637082256691, 2.09029239656691, 2.09468365117687, 
2.09468365117687, 2.09076207717687, 2.09468365117687), intra_digit_time = c(76072848, 
100906925, 65971232, 66473272, 76072850, 100906927, 65971234, 
66473274, 76072848, 100906925, 65971232, 66473272, 76072850, 
100906927, 65971234, 66473274), inter_digit_time = c(0, 229174998, 
324746190, 496211113, 0, 229174996, 324746188, 496211111, 0, 
229174998, 324746190, 496211113, 0, 229174996, 324746188, 496211111
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-16L), .Names = c("user_id", "press_id", "press_acc_mag", "release_acc_mag", 
"max_acc", "min_acc", "avg_acc", "press_vel_ang_mag", "release_vel_ang_mag", 
"max_vel_ang", "min_vel_ang", "avg_vel_ang", "press_size", "release_size", 
"intra_digit_time", "inter_digit_time"))

我想将其转换为这样的2行数据集(空白值应带有适当的数字):

user_id press_acc_mag_press_id_1 press_acc_mag_press_id_2 press_acc_mag_press_id_3 press_acc_mag_press_id_4 release_acc_mag_press_id_1 release_acc_mag_press_id_2 release_acc_mag_press_id_3 release_acc_mag_press_id_4 ..... 
    1
    2

请告知如何将其转换为这种格式?我正在尝试使用 tidyr::gather 和 spread:

library(dplyr)
library(tidyr)
# NOTE(review): the dput() data above has no 'replicate' column, so this
# filter() call errors as written — presumably 'replicate' exists in the
# full dataset; confirm against the real data. Also, gather(key, value)
# without excluding user_id/press_id stacks the id columns into the value
# column too, so the wide layout cannot be recovered afterwards.
df %>% filter(replicate %in% c(1,2), user_id %in% c(1,2)) %>% ungroup() %>%
  select(user_id, 
         press_acc_mag, release_acc_mag, max_acc, min_acc, avg_acc,
         press_vel_ang_mag, release_vel_ang_mag, max_vel_ang, min_vel_ang, avg_vel_ang,
         press_size, release_size, intra_digit_time, inter_digit_time) %>% 
  gather(key, value)

但没有运气,请提出建议。

注意:这里每个用户 ID 都有 4 行,意味着列数需要乘以 4,因此每列都应带有对应 press_id(1-4)的后缀。

3 个答案:

答案 0 :(得分:3)

尽管该问题已用 dplyr 和 tidyr 标记,但为完整起见,我还是想指出 dcast() 能够同时将多个值列重塑为宽格式。因此,可以完全跳过其他答案中使用的 gather() 步骤。

此外,rowid()函数用于方便地处理OP样本数据集中每个user_id的多个记录。

此外,保留了列的原始顺序,如 OP 的预期输出所示(press_acc_mag_1、press_acc_mag_2 等,而不是 avg_acc_1、avg_acc_2 等)。

library(data.table)   # version 1.11.4 used
# dcast() can reshape multiple value columns to wide format in a single
# call (value.var takes a vector of column names), so the separate
# gather()/spread() step used in the other answers can be skipped.
# rowid(press_id, user_id) numbers the repeated observations within each
# (press_id, user_id) combination, distinguishing the two replicate
# records per user; it becomes the sub_id column shown in the output.
setDT(df)[, sub_id := rowid(press_id, user_id)]
dcast(df, user_id + sub_id ~ press_id,
      value.var = setdiff(names(df), c("user_id", "press_id", "sub_id")))
   user_id sub_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4
1:       1      1        23.65371        23.08469        23.16152        23.16117
2:       1      2        23.26366        22.69464        22.77147        22.77113
3:       2      1        23.16931        22.60028        22.67712        22.67677
4:       2      2        23.90963        23.34060        23.41744        23.41709
   release_acc_mag_1 release_acc_mag_2 release_acc_mag_3 release_acc_mag_4 max_acc_1 max_acc_2
1:          22.44456          22.39747          22.53708          22.35672  23.73279  23.87721
2:          22.44941          22.40232          22.54193          22.36157  23.54196  23.68638
3:          22.45173          22.40464          22.54425          22.36389  23.85584  24.00026
4:          22.40993          22.36283          22.50244          22.32208  24.02185  24.16626
   max_acc_3 max_acc_4 min_acc_1 min_acc_2 min_acc_3 min_acc_4 avg_acc_1 avg_acc_2 avg_acc_3
1:  23.46289  23.43333  22.28632  22.20265  22.18723  22.28352  22.99711  22.88052  22.81622
2:  23.27206  23.24250  22.24714  22.16346  22.14804  22.24433  22.94327  22.82668  22.76239
3:  23.58594  23.55638  22.22130  22.13763  22.12220  22.21850  22.81982  22.70323  22.63894
4:  23.75195  23.72239  22.27639  22.19272  22.17730  22.27359  22.69978  22.58319  22.51890
...
...

答案 1 :(得分:1)

 library(tidyverse)
 # Restrict to the first 8 rows because each user_id is duplicated in the
 # sample data. gather() stacks all measurement columns into key/value
 # pairs (keeping user_id and press_id), paste0() appends the press_id to
 # each variable name, and spread() fans those combined names out into
 # wide columns.
 # NOTE(review): gather()/spread() are superseded by pivot_longer()/
 # pivot_wider() in current tidyr; the result also stays grouped by
 # user_id — confirm whether a trailing ungroup() is wanted.
 df[1:8,] %>% group_by(user_id) %>%
              gather(key,value,-user_id, -press_id) %>% mutate(new=paste0(key,'_',press_id)) %>% 
              select(-press_id,-key) %>% spread(new,value)

答案 2 :(得分:1)

基本思想是首先收集值,以使数据集采用完全长格式,然后创建一个新变量,该变量是变量名和press_id的组合,然后散布回去。

在您的示例中,您以某种方式使每个用户ID翻了一番,因此我只使用了前8行。

请注意,您可以选择通过将变量转换为因子并保持原始顺序来保持列的顺序。

# Gather all measurement columns into fully long form (keeping the id
# columns), build a combined name "<variable>_<press_id>", then spread
# back out to wide. fct_inorder() pins the factor levels to first-
# appearance order so spread() keeps the original column order instead
# of sorting the new columns alphabetically.
df[1:8,] %>% gather("variable", "value", -c(user_id, press_id)) %>% 
  mutate(X=paste(variable, press_id, sep="_")) %>%
  select(-c(press_id, variable)) %>%
  mutate_at("X", forcats::fct_inorder) %>%
  spread(X, value)

## # A tibble: 2 x 57
##   user_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4 release_acc_mag_1 release_acc_mag_2
## *   <int>           <dbl>           <dbl>           <dbl>           <dbl>             <dbl>             <dbl>
## 1       1            23.7            23.1            23.2            23.2              22.4              22.4
## 2       2            23.2            22.6            22.7            22.7  

还有更多列,我不在这里显示。