这是我的数据框示例:
structure(list(user_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), press_id = c(1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), press_acc_mag = c(23.6537093679718,
23.0846851796042, 23.1615193166382, 23.1611737715988, 23.1693053238739,
22.6002811355062, 22.6771152725402, 22.6767697275009, 23.2636642131927,
22.6946400248251, 22.7714741618591, 22.7711286168198, 23.9096268064615,
23.3406026180939, 23.4174367551279, 23.4170912100885), release_acc_mag = c(22.444559535822,
22.3974678281557, 22.5370781850474, 22.3567189737439, 22.4517313344457,
22.4046396267793, 22.544249983671, 22.3638907723676, 22.4494140705383,
22.4023223628719, 22.5419327197636, 22.3615735084602, 22.4099252863741,
22.3628335787077, 22.5024439355994, 22.322084724296), max_acc = c(23.7327911876129,
23.8772090302736, 23.4628943410094, 23.4333331415454, 23.8558416360749,
24.0002594787356, 23.5859447894714, 23.5563835900074, 23.5419580147544,
23.686375857415, 23.2720611681509, 23.2424999686869, 24.0218455738806,
24.1662634165412, 23.7519487272771, 23.7223875278131), min_acc = c(22.2863243700941,
22.2026521966429, 22.1872267275715, 22.2835176932173, 22.2213018995416,
22.1376297260904, 22.122204257019, 22.2184952226648, 22.247136081127,
22.1634639076758, 22.1480384386044, 22.2443294042502, 22.2763940267469,
22.1927218532957, 22.1772963842243, 22.2735873498701), avg_acc = c(22.9971077741663,
22.8805170955298, 22.8162218339998, 22.8998658131877, 22.8198237605325,
22.7032330818959, 22.6389378203659, 22.7225817995538, 22.9432723377064,
22.8266816590698, 22.7623863975398, 22.8460303767277, 22.6997818029301,
22.5831911242936, 22.5188958627636, 22.6025398419515), press_vel_ang_mag = c(2.99159368861775,
3.69840841890355, 3.32010086461129, 3.59665109519773, 2.63983403890421,
3.3346628319386, 2.95635527764634, 3.19667346535676, 3.18228873692629,
3.88910346721209, 3.51079591291983, 3.78734614350627, 3.25113160166759,
3.94596039470198, 3.56765284040972, 3.80797102812014), release_vel_ang_mag = c(3.92468401207319,
4.03008319373796, 4.1763926713929, 3.97486508224465, 4.058922586184,
4.16432176784877, 4.31063124550371, 4.13278517230109, 3.99069606133371,
4.09609524299848, 4.24240472065342, 4.04087713150517, 4.02678472891242,
4.13218391057718, 4.27849338823212, 4.1006473150295), max_vel_ang = c(4.1156188310448,
4.35656997515235, 4.36732749036451, 4.27129525975646, 3.80582301429526,
4.04677415840281, 4.05753167361497, 3.96149944300692, 3.65238824784895,
3.8933393919565, 3.90409690716865, 3.8080646765606, 3.81641565599981,
4.05736680010736, 4.06812431531952, 3.97209208471146), min_vel_ang = c(3.16836561463486,
3.30917959689493, 3.4968727906284, 3.4259687986302, 2.94495943811947,
3.08577342037954, 3.27346661411301, 3.20256262211481, 2.79156870664285,
2.93238268890291, 3.12007588263639, 3.04917189063818, 3.17862422648419,
3.31943820874426, 3.50713140247773, 3.43622741047952), avg_vel_ang = c(3.49530401281139,
3.62360655957713, 3.71126066224395, 3.66119576608271, 3.31029178731189,
3.43859433407763, 3.52624843674446, 3.47618354058321, 3.36223808478038,
3.49054063154613, 3.57819473421295, 3.52812983805171, 3.47104738292443,
3.59934992969018, 3.687004032357, 3.63693913619575), press_size = c(2.08045207961665,
2.09221678961665, 2.08045207961665, 2.08437364761665, 2.07665439720744,
2.08841910720744, 2.07665439720744, 2.08057596520744, 2.07421744576499,
2.08598215576499, 2.07421744576499, 2.07813901376499, 2.07581316516794,
2.08757787516794, 2.07581316516794, 2.07973473316794), release_size = c(2.08792744972158,
2.08792744972158, 2.08400587572158, 2.08792744972158, 2.0914990670944,
2.0914990670944, 2.0875774930944, 2.0914990670944, 2.09029239656691,
2.09029239656691, 2.08637082256691, 2.09029239656691, 2.09468365117687,
2.09468365117687, 2.09076207717687, 2.09468365117687), intra_digit_time = c(76072848,
100906925, 65971232, 66473272, 76072850, 100906927, 65971234,
66473274, 76072848, 100906925, 65971232, 66473272, 76072850,
100906927, 65971234, 66473274), inter_digit_time = c(0, 229174998,
324746190, 496211113, 0, 229174996, 324746188, 496211111, 0,
229174998, 324746190, 496211113, 0, 229174996, 324746188, 496211111
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-16L), .Names = c("user_id", "press_id", "press_acc_mag", "release_acc_mag",
"max_acc", "min_acc", "avg_acc", "press_vel_ang_mag", "release_vel_ang_mag",
"max_vel_ang", "min_vel_ang", "avg_vel_ang", "press_size", "release_size",
"intra_digit_time", "inter_digit_time"))
我想将其转换为这样的2行数据集(空白值应带有适当的数字):
user_id press_acc_mag_press_id_1 press_acc_mag_press_id_2 press_acc_mag_press_id_3 press_acc_mag_press_id_4 release_acc_mag_press_id_1 release_acc_mag_press_id_2 release_acc_mag_press_id_3 release_acc_mag_press_id_4 .....
1
2
请告知如何将其转换为这种格式? 我正在尝试使用 tidyr::gather 和 spread:
library(dplyr)
library(tidyr)
# Asker's attempt: keep rows whose replicate and user_id are 1 or 2, drop any
# grouping, keep user_id plus the fourteen measurement columns, then stack
# everything into key/value pairs.
# NOTE(review): `replicate` is not a column of the sample data frame shown
# above -- presumably it exists in the asker's full data; confirm.
# NOTE(review): press_id is not in the select() list, and gather(key, value)
# with no column selection gathers every remaining column (user_id included),
# so the long result no longer carries the identifiers needed to spread back.
df %>% filter(replicate %in% c(1,2), user_id %in% c(1,2)) %>% ungroup() %>%
select(user_id,
press_acc_mag, release_acc_mag, max_acc, min_acc, avg_acc,
press_vel_ang_mag, release_vel_ang_mag, max_vel_ang, min_vel_ang, avg_vel_ang,
press_size, release_size, intra_digit_time, inter_digit_time) %>%
gather(key, value)
但没有运气,请提出建议。
注意:这里每个 user_id 都有4行,也就是说我需要 4 倍的列数,因此每个列名都应带上对应 press_id(1-4)的后缀
答案 0 :(得分:3)
尽管该问题带有 dplyr 和 tidyr 标签,但为完整起见,我还是想指出 dcast() 能够同时将多个值列重塑为宽格式。因此,可以完全跳过其他答案中使用的 gather() 步骤。
此外,这里使用 rowid() 函数来方便地处理OP样本数据集中每个 user_id 对应多组记录的情况。
此外,列保留了原始顺序,与OP的预期输出一致(即 press_acc_mag_1、press_acc_mag_2 等在前,而不是 avg_acc_1、avg_acc_2 等)。
library(data.table) # version 1.11.4 used
# Reshape df to wide format in a single dcast() call, spreading all measure
# columns at once.
# - setDT(df) turns df into a data.table before casting.
# - Left of `~`: user_id plus a rowid() counter computed over press_id and
#   user_id, which separates the repeated press_id 1-4 series of each user.
# - Right of `~`: press_id, whose values become the suffix of each new column.
# - value.var: every column except the two identifiers.
# NOTE(review): the `sub_id <- press_id` assignment inside rowid() appears to
# exist only so that the counter column is named `sub_id` in the result shown
# below -- confirm against dcast()'s column-naming behaviour before changing it.
dcast(setDT(df), user_id + rowid(sub_id <- press_id, user_id) ~ press_id,
value.var = setdiff(names(df), c("user_id", "press_id")))
   user_id sub_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4
1:       1      1        23.65371        23.08469        23.16152        23.16117
2:       1      2        23.26366        22.69464        22.77147        22.77113
3:       2      1        23.16931        22.60028        22.67712        22.67677
4:       2      2        23.90963        23.34060        23.41744        23.41709
   release_acc_mag_1 release_acc_mag_2 release_acc_mag_3 release_acc_mag_4 max_acc_1 max_acc_2
1:          22.44456          22.39747          22.53708          22.35672  23.73279  23.87721
2:          22.44941          22.40232          22.54193          22.36157  23.54196  23.68638
3:          22.45173          22.40464          22.54425          22.36389  23.85584  24.00026
4:          22.40993          22.36283          22.50244          22.32208  24.02185  24.16626
   max_acc_3 max_acc_4 min_acc_1 min_acc_2 min_acc_3 min_acc_4 avg_acc_1 avg_acc_2 avg_acc_3
1:  23.46289  23.43333  22.28632  22.20265  22.18723  22.28352  22.99711  22.88052  22.81622
2:  23.27206  23.24250  22.24714  22.16346  22.14804  22.24433  22.94327  22.82668  22.76239
3:  23.58594  23.55638  22.22130  22.13763  22.12220  22.21850  22.81982  22.70323  22.63894
4:  23.75195  23.72239  22.27639  22.19272  22.17730  22.27359  22.69978  22.58319  22.51890
...
答案 1 :(得分:1)
library(tidyverse)
# Long-then-wide reshape of the first eight rows: the result has one row per
# user_id and one column per <measure>_<press_id> combination.
df[seq_len(8), ] %>%
  group_by(user_id) %>%
  # Stack every measure column into key/value pairs, keeping both identifiers.
  gather(key = "key", value = "value", -c(user_id, press_id)) %>%
  # Name of the future wide column: measure name followed by its press_id.
  mutate(new = paste(key, press_id, sep = "_")) %>%
  select(-c(press_id, key)) %>%
  spread(key = new, value = value)
答案 2 :(得分:1)
基本思想是:首先用 gather 收集各个值,使数据集变成完全的长格式;然后创建一个由变量名和 press_id 组合而成的新变量;最后再用 spread 展开回宽格式。
在您的示例中,每个 user_id 的数据不知为何重复了一遍,因此我只使用了前8行。
请注意,您可以选择通过将变量转换为因子并保持原始顺序来保持列的顺序。
# Long-then-wide reshape of the first eight rows that keeps the measure
# columns in their original order instead of sorting them alphabetically.
df[seq_len(8), ] %>%
  gather(key = "variable", value = "value", -user_id, -press_id) %>%
  # Combine measure name and press_id into the wide column name; turning it
  # into a factor with levels in order of first appearance is what fixes the
  # column order produced by spread().
  mutate(X = forcats::fct_inorder(paste0(variable, "_", press_id))) %>%
  select(-press_id, -variable) %>%
  spread(key = X, value = value)
## # A tibble: 2 x 57
## user_id press_acc_mag_1 press_acc_mag_2 press_acc_mag_3 press_acc_mag_4 release_acc_mag_1 release_acc_mag_2
## * <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 23.7 23.1 23.2 23.2 22.4 22.4
## 2 2 23.2 22.6 22.7 22.7 22.5 22.4
还有更多列,我不在这里显示。