仅供参考:我有解决这个问题的方法,但我想为有类似问题的人发布解决方案。
我有一个长格式的数据集
dat = data.frame(id = rep(1:2, each=10),
x1 = runif(20),
x2 = rnorm(20),
x3 = letters[1:20]
)
,其中一列标识每个id
dat$id_row = ave(as.character(dat$id), as.character(dat$id), FUN = seq_along)
并且某些值仅在每id
dat$x3[dat$id_row>1] = NA
所以dat
看起来像这样:
id x1 x2 x3 id_row
1 1 0.54788708 -0.3870533 a 1
2 1 0.41625779 0.1528117 <NA> 2
3 1 0.26278100 -0.9239091 <NA> 3
4 1 0.69968279 1.2025596 <NA> 4
5 1 0.26647728 0.3301025 <NA> 5
6 1 0.72934348 0.3218639 <NA> 6
7 1 0.43240532 -0.3136323 <NA> 7
8 1 0.98646191 -2.0612792 <NA> 8
9 1 0.69606418 1.7286719 <NA> 9
10 1 0.54025279 1.4872084 <NA> 10
11 2 0.33639210 -0.1692205 k 1
12 2 0.05372062 0.9645393 <NA> 2
13 2 0.46421037 -1.1011200 <NA> 3
14 2 0.66753436 -1.5613799 <NA> 4
15 2 0.72302407 -0.9173485 <NA> 5
16 2 0.26640179 0.7012415 <NA> 6
17 2 0.37752229 -0.1136814 <NA> 7
18 2 0.45110511 -0.2051627 <NA> 8
19 2 0.47969921 1.4154800 <NA> 9
20 2 0.23142331 -0.6178061 <NA> 10
我想使用像这样的data.table::dcast
将其转换为宽格式
library(data.table)
setDT(dat, key="id")
cols = colnames(dat)
cols = cols[!cols %in% c("id", "id_row")] #all the columns other than these
dat_wide = dcast(dat, id ~ id_row,
value.var = cols)
所以dat_wide
看起来像这样:
id x1_1 x1_10 x1_2 x1_3 x1_4 x1_5 x1_6 x1_7 x1_8 x1_9
1: 1 0.5478871 0.5402528 0.41625779 0.2627810 0.6996828 0.2664773 0.7293435 0.4324053 0.9864619 0.6960642
2: 2 0.3363921 0.2314233 0.05372062 0.4642104 0.6675344 0.7230241 0.2664018 0.3775223 0.4511051 0.4796992
x2_1 x2_10 x2_2 x2_3 x2_4 x2_5 x2_6 x2_7 x2_8 x2_9
1: -0.3870533 1.4872084 0.1528117 -0.9239091 1.20256 0.3301025 0.3218639 -0.3136323 -2.0612792 1.728672
2: -0.1692205 -0.6178061 0.9645393 -1.1011200 -1.56138 -0.9173485 0.7012415 -0.1136814 -0.2051627 1.415480
x3_1 x3_10 x3_2 x3_3 x3_4 x3_5 x3_6 x3_7 x3_8 x3_9
1: a NA NA NA NA NA NA NA NA NA
2: k NA NA NA NA NA NA NA NA NA
我想要以下内容:
x1_01
而不是x1_1
)_1
添加的dcast
。答案 0 :(得分:2)
你可以......
dcast(
copy(dat)[,
x3 := first(na.omit(x3)), by=id][,
id_row := sprintf("%02d", as.integer(id_row))],
id + x3 ~ id_row,
value.var = c("x1", "x2")
)
id x3 x1_01 x1_02 x1_03 x1_04 x1_05 x1_06 x1_07 x1_08 x1_09 x1_10 x2_01 x2_02 x2_03 x2_04 x2_05 x2_06 x2_07 x2_08 x2_09 x2_10
1: 1 a 0.1378046 0.5520807 0.05924109 0.06332558 0.7777398 0.9895027 0.2064026 0.03098261 0.95197243 0.7477726 0.7604617 -0.3261378 0.8879344 0.5863483 -0.7902251 1.5264813 -0.7777892 -0.6412351 -0.7436965 -0.006662512
2: 2 k 0.3795247 0.8378224 0.66084335 0.34242055 0.6004159 0.6608979 0.4316877 0.63934958 0.02383587 0.1881800 1.9792712 0.9256477 -0.7791544 -0.9860244 0.1341959 0.1352514 0.5140671 -0.1055499 -1.4497458 -1.526578088
也就是说,填写x3值,因为它们显然应该应用于每个id的所有行,并将x3视为分组变量; apply formatting to id_row;然后dcast。
如果x3应该在最后,你可以使用setcolorder
。
处理x3(显然是id的属性)的更简洁但更长的方法是创建一个id表:
# create id table
idDT = dat[!is.na(x3), .(id, x3)]
setkey(idDT, id)
# drop id attributes from main table
dat[, x3 := NULL]
# maybe drop id_row since it can be made on the fly
dat[, id_row := NULL]
# go wide
res = dcast(dat, id ~ sprintf("%02d", rowid(id)), value.var = setdiff(names(dat), "id"))
# add attributes back
res[idDT, on=key(idDT), x3 := i.x3]
id x1_01 x1_02 x1_03 x1_04 x1_05 x1_06 x1_07 x1_08 x1_09 x1_10 x2_01 x2_02 x2_03 x2_04 x2_05 x2_06 x2_07 x2_08 x2_09 x2_10 x3
1: 1 0.1378046 0.5520807 0.05924109 0.06332558 0.7777398 0.9895027 0.2064026 0.03098261 0.95197243 0.7477726 0.7604617 -0.3261378 0.8879344 0.5863483 -0.7902251 1.5264813 -0.7777892 -0.6412351 -0.7436965 -0.006662512 a
2: 2 0.3795247 0.8378224 0.66084335 0.34242055 0.6004159 0.6608979 0.4316877 0.63934958 0.02383587 0.1881800 1.9792712 0.9256477 -0.7791544 -0.9860244 0.1341959 0.1352514 0.5140671 -0.1055499 -1.4497458 -1.526578088 k
答案 1 :(得分:1)
要添加前导零,我这样做:
colnames(dat_wide) = gsub("(*_)(\\d{1})$", "\\10\\2", colnames(dat_wide))
#' gsub explanation:
#' arg1, first parentheses : find any string (of any length) that is followed by an underscore
#' arg1, second parentheses: find any single digit ("\\d" for any digit(s) and {1} to limit the number of matches to 1)
#' arg1, $: ensures that the regex in the second parentheses is at the end of the string
#' arg2, "\\1": the value in first parentheses of arg1
#' arg2, 0: zero gets inserted
#' arg2, "\\2": the value in second parentheses of arg1
从那里,我可以删除所有空列
class(dat_wide) = "data.frame" #necessary for the next step
dat_wide= dat_wide[, names(which(sapply(dat_wide, function(x) mean(is.na(x)))!=1))]
# sapply returns a vector giving the percent missing in each column, 100% missing = 1
# names(which(sapply(...)!=1)) gives the names of the columns that are not 100% missing
我删除了_1
添加的dcast
cols = colnames(dat_wide)
cols_alpha = gsub("_\\d{0,}$", "", cols)
# removes the underscore and any digits occurring at the end of the string
cols_unique = cols_alpha[!(cols_alpha %in% cols_alpha[duplicated(cols_alpha)])]
#base column names that aren't repeated
for(i in 1:length(cols_unique)){
temp = match.arg(cols_unique[i], cols)
colnames(dat_wide) = gsub(temp, cols_unique[i], colnames(dat_wide), fixed = T )
}
现在dat_wide
看起来像这样(我想要的方式)
dat_wide
id x1_01 x1_10 x1_02 x1_03 x1_04 x1_05 x1_06 x1_07 x1_08 x1_09
1 1 0.05139797 0.6901079 0.78155587 0.5717956 0.8652542 0.1341294 0.6745674 0.97447287 0.7684123 0.9038830
2 2 0.49687041 0.9880967 0.07189928 0.1835206 0.3563691 0.2008427 0.9795765 0.06875338 0.3590017 0.1253108
x2_01 x2_10 x2_02 x2_03 x2_04 x2_05 x2_06 x2_07 x2_08 x2_09 x3
1 0.27216865 -0.6608880 -0.9184810 -1.3895615 2.481393 1.8167519 0.4966844 0.1151766 1.119384 0.2650763 a
2 0.01208705 -0.5245594 -0.5032424 -0.5924594 1.037335 -0.1715831 -0.1698359 1.6956572 -1.275270 -0.1278430 k