Question

仅供参考：我有解决这个问题的方法，但我想为有类似问题的人发布解决方案。

我有一个长格式的数据集

# Long-format toy data: two ids with ten observations each.
# x1 and x2 are random draws; x3 carries one letter per row.
dat <- data.frame(
  id = rep(1:2, each = 10),
  x1 = runif(20),
  x2 = rnorm(20),
  x3 = letters[1:20]
)

其中有一列标识每个id内的行顺序：

# Number the rows within each id (1, 2, ... per group). Because ave() is given
# a character vector, the resulting id_row column is character as well.
dat[["id_row"]] <- ave(
  as.character(dat[["id"]]),
  as.character(dat[["id"]]),
  FUN = seq_along
)

并且某些值在每个id中只出现一次：

# Keep x3 only on the first row of each id; set it to NA everywhere else.
# id_row is stored as character, so convert it before the numeric comparison
# rather than relying on an implicit, locale-dependent string comparison.
dat$x3[as.integer(dat$id_row) > 1L] <- NA

所以dat看起来像这样：

   id         x1         x2   x3 id_row
1   1 0.54788708 -0.3870533    a      1
2   1 0.41625779  0.1528117 <NA>      2
3   1 0.26278100 -0.9239091 <NA>      3
4   1 0.69968279  1.2025596 <NA>      4
5   1 0.26647728  0.3301025 <NA>      5
6   1 0.72934348  0.3218639 <NA>      6
7   1 0.43240532 -0.3136323 <NA>      7
8   1 0.98646191 -2.0612792 <NA>      8
9   1 0.69606418  1.7286719 <NA>      9
10  1 0.54025279  1.4872084 <NA>     10
11  2 0.33639210 -0.1692205    k      1
12  2 0.05372062  0.9645393 <NA>      2
13  2 0.46421037 -1.1011200 <NA>      3
14  2 0.66753436 -1.5613799 <NA>      4
15  2 0.72302407 -0.9173485 <NA>      5
16  2 0.26640179  0.7012415 <NA>      6
17  2 0.37752229 -0.1136814 <NA>      7
18  2 0.45110511 -0.2051627 <NA>      8
19  2 0.47969921  1.4154800 <NA>      9
20  2 0.23142331 -0.6178061 <NA>     10

我想使用像这样的data.table::dcast将其转换为宽格式

library(data.table)
# Convert dat to a data.table in place, keyed by id.
setDT(dat, key = "id")

# Value columns: everything except the id and the within-id row counter.
cols <- Filter(function(nm) !(nm %in% c("id", "id_row")), colnames(dat))

# Spread to wide format: one row per id, one column per (variable, id_row) pair.
dat_wide <- dcast(dat, id ~ id_row, value.var = cols)

所以dat_wide看起来像这样：

   id      x1_1     x1_10       x1_2      x1_3      x1_4      x1_5      x1_6      x1_7      x1_8      x1_9
1:  1 0.5478871 0.5402528 0.41625779 0.2627810 0.6996828 0.2664773 0.7293435 0.4324053 0.9864619 0.6960642
2:  2 0.3363921 0.2314233 0.05372062 0.4642104 0.6675344 0.7230241 0.2664018 0.3775223 0.4511051 0.4796992
         x2_1      x2_10      x2_2       x2_3     x2_4       x2_5      x2_6       x2_7       x2_8     x2_9
1: -0.3870533  1.4872084 0.1528117 -0.9239091  1.20256  0.3301025 0.3218639 -0.3136323 -2.0612792 1.728672
2: -0.1692205 -0.6178061 0.9645393 -1.1011200 -1.56138 -0.9173485 0.7012415 -0.1136814 -0.2051627 1.415480
    x3_1 x3_10 x3_2 x3_3 x3_4 x3_5 x3_6 x3_7 x3_8 x3_9
1:     a    NA   NA   NA   NA   NA   NA   NA   NA   NA
2:     k    NA   NA   NA   NA   NA   NA   NA   NA   NA

我想要以下内容：

列名称的前导零（例如x1_01而不是x1_1）
删除所有空列
对于只有一次观察的列，移除dcast添加的_1后缀。

Answer 1

你可以......

# Work on a copy so the original dat is left untouched.
dat_filled <- copy(dat)

# x3 is an attribute of the id: spread its single non-missing value
# to every row of that id.
dat_filled[, x3 := first(na.omit(x3)), by = id]

# Zero-pad the row counter so the wide column names read 01, 02, ..., 10.
dat_filled[, id_row := sprintf("%02d", as.integer(id_row))]

# Cast with x3 as part of the row identifier; only x1 and x2 are spread.
dcast(dat_filled, id + x3 ~ id_row, value.var = c("x1", "x2"))

   id x3     x1_01     x1_02      x1_03      x1_04     x1_05     x1_06     x1_07      x1_08      x1_09     x1_10     x2_01      x2_02      x2_03      x2_04      x2_05     x2_06      x2_07      x2_08      x2_09        x2_10
1:  1  a 0.1378046 0.5520807 0.05924109 0.06332558 0.7777398 0.9895027 0.2064026 0.03098261 0.95197243 0.7477726 0.7604617 -0.3261378  0.8879344  0.5863483 -0.7902251 1.5264813 -0.7777892 -0.6412351 -0.7436965 -0.006662512
2:  2  k 0.3795247 0.8378224 0.66084335 0.34242055 0.6004159 0.6608979 0.4316877 0.63934958 0.02383587 0.1881800 1.9792712  0.9256477 -0.7791544 -0.9860244  0.1341959 0.1352514  0.5140671 -0.1055499 -1.4497458 -1.526578088

也就是说，先填充x3的值（因为它们显然适用于每个id的所有行），并将x3视为分组变量；然后对id_row应用格式化；最后再dcast。

如果x3应该在最后，你可以使用setcolorder。

处理x3（显然是id的属性）的一种更干净但更繁琐的方法是创建一个id表：

# Build a lookup table holding the per-id attribute x3, keyed by id.
idDT <- dat[!is.na(x3), list(id, x3)]
setkey(idDT, id)

# Remove the id-level attribute from the main table, together with id_row,
# which is regenerated on the fly with rowid() below.
dat[, c("x3", "id_row") := NULL]

# Cast to wide format, using a zero-padded within-id row number as the suffix.
value_cols <- setdiff(names(dat), "id")
res <- dcast(dat, id ~ sprintf("%02d", rowid(id)), value.var = value_cols)

# Join the attribute back onto the wide table by id.
res[idDT, on = key(idDT), x3 := i.x3]

   id     x1_01     x1_02      x1_03      x1_04     x1_05     x1_06     x1_07      x1_08      x1_09     x1_10     x2_01      x2_02      x2_03      x2_04      x2_05     x2_06      x2_07      x2_08      x2_09        x2_10 x3
1:  1 0.1378046 0.5520807 0.05924109 0.06332558 0.7777398 0.9895027 0.2064026 0.03098261 0.95197243 0.7477726 0.7604617 -0.3261378  0.8879344  0.5863483 -0.7902251 1.5264813 -0.7777892 -0.6412351 -0.7436965 -0.006662512  a
2:  2 0.3795247 0.8378224 0.66084335 0.34242055 0.6004159 0.6608979 0.4316877 0.63934958 0.02383587 0.1881800 1.9792712  0.9256477 -0.7791544 -0.9860244  0.1341959 0.1352514  0.5140671 -0.1055499 -1.4497458 -1.526578088  k

Answer 2

要添加前导零，我这样做：

# Zero-pad single-digit suffixes: "x1_1" -> "x1_01"; "x1_10" is left alone.
#   "_"      : the underscore that precedes the suffix
#   "(\\d)"  : exactly one digit, captured as group 1
#   "$"      : anchors the match at the end of the name, so the underscore
#              must be immediately followed by a single, final digit
#   "_0\\1"  : the underscore, an inserted zero, then the captured digit
colnames(dat_wide) <- sub("_(\\d)$", "_0\\1", colnames(dat_wide))

从那里，我可以删除所有空列

# Drop every column that consists entirely of NA.
# Convert to a plain data.frame first so that `[` below uses base data.frame
# indexing semantics.
dat_wide <- as.data.frame(dat_wide)

# TRUE for columns holding at least one non-missing value.
keep <- !vapply(dat_wide, function(x) all(is.na(x)), logical(1))

# drop = FALSE keeps the result a data.frame even if only one column survives.
dat_wide <- dat_wide[, keep, drop = FALSE]

然后我删除dcast添加的_1后缀

# Strip the numeric suffix from variables that ended up with a single column
# (e.g. "x3_01" -> "x3"); variables with several columns keep their suffixes.
cols <- colnames(dat_wide)

# Base name of each column: the name without a trailing "_<digits>" suffix.
cols_alpha <- sub("_\\d+$", "", cols)

# Base names that occur exactly once, i.e. variables with only one column.
cols_unique <- cols_alpha[!(cols_alpha %in% cols_alpha[duplicated(cols_alpha)])]

# Rename by position rather than by substring replacement, so no other column
# name can be touched and an empty set of unique names is a harmless no-op.
single <- cols_alpha %in% cols_unique
colnames(dat_wide)[single] <- cols_alpha[single]

现在dat_wide看起来像这样（我想要的方式）

dat_wide
  id      x1_01     x1_10      x1_02     x1_03     x1_04     x1_05     x1_06      x1_07     x1_08     x1_09
1  1 0.05139797 0.6901079 0.78155587 0.5717956 0.8652542 0.1341294 0.6745674 0.97447287 0.7684123 0.9038830
2  2 0.49687041 0.9880967 0.07189928 0.1835206 0.3563691 0.2008427 0.9795765 0.06875338 0.3590017 0.1253108
       x2_01      x2_10      x2_02      x2_03    x2_04      x2_05      x2_06     x2_07     x2_08      x2_09 x3
1 0.27216865 -0.6608880 -0.9184810 -1.3895615 2.481393  1.8167519  0.4966844 0.1151766  1.119384  0.2650763  a
2 0.01208705 -0.5245594 -0.5032424 -0.5924594 1.037335 -0.1715831 -0.1698359 1.6956572 -1.275270 -0.1278430  k

在R中重新整形后对列进行编程重命名

2 个答案: