我有一个data.table G
如下。
d <- list( c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), c("SD455:TOMATO", "SD86:RAY", "SD39:MATURE", "SD86:COSMIC"), c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"), c("SD32:TRUMPET", "SD1:FIELD", "SD40:GREEN", "SD40:PARK"))
d2 <- lapply(d, function(x) sapply(strsplit(x, ":"), "[", 1))
d <- lapply(d, function(x) paste0(unique(x), collapse=", "))
d2 <- lapply(d2, function(x) paste0(unique(x), collapse=", "))
d <- as.data.frame(as.matrix(lapply(d, paste0, collapse=", ")))
d2 <- as.data.frame(as.matrix(lapply(d2, paste0, collapse=", ")))
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
d$sd <- as.character(d$sd)
d$sdw <- as.character(d$sdw)
G <- data.table( d , key = "sd" )
G[, GNO:= seq(1,nrow(G))]
setcolorder(G, neworder=c("GNO", "sd", "sdw"))
G
GNO sd sdw
1: 1 SD1, SD44, SD384, SD32 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET
2: 2 SD12, SD315, SD387 SD12:PAINTING, SD315:MONEY31, SD387:SPRING
3: 3 SD23, SD1, SD567 SD23:SWITCH, SD1:LUSH, SD567:TREK
4: 4 SD32, SD1, SD40 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK
5: 5 SD345, SD340, SD387 SD345:HOLE, SD340:DUST, SD387:ROLL
6: 6 SD42, SD345, SD183 SD42:CRAYON, SD345:FOX, SD183:WIRE
7: 7 SD455, SD86, SD39 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC
我正在尝试拆分列sd
中的每组元素,并根据列GNO
获取修改后的data.table,如下所示。
G2 <- G[, list(sd = unlist(strsplit( sd , "," ))) , by = list(GNO, sdw)]
G2
GNO sdw sd
1: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD1
2: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD44
3: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD384
4: 1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET SD32
5: 2 SD12:PAINTING, SD315:MONEY31, SD387:SPRING SD12
6: 2 SD12:PAINTING, SD315:MONEY31, SD387:SPRING SD315
7: 2 SD12:PAINTING, SD315:MONEY31, SD387:SPRING SD387
8: 3 SD23:SWITCH, SD1:LUSH, SD567:TREK SD23
9: 3 SD23:SWITCH, SD1:LUSH, SD567:TREK SD1
10: 3 SD23:SWITCH, SD1:LUSH, SD567:TREK SD567
11: 4 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK SD32
12: 4 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK SD1
13: 4 SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK SD40
14: 5 SD345:HOLE, SD340:DUST, SD387:ROLL SD345
15: 5 SD345:HOLE, SD340:DUST, SD387:ROLL SD340
16: 5 SD345:HOLE, SD340:DUST, SD387:ROLL SD387
17: 6 SD42:CRAYON, SD345:FOX, SD183:WIRE SD42
18: 6 SD42:CRAYON, SD345:FOX, SD183:WIRE SD345
19: 6 SD42:CRAYON, SD345:FOX, SD183:WIRE SD183
20: 7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC SD455
21: 7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC SD86
22: 7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC SD39
我还想在sdw中拆分元素。但是当我尝试这个时,由于sd
和sdw
中的元素数量不一定相同,所有这一切都搞砸了。
t <- G[ , list(sd = unlist(strsplit(sd, "," )),
sdw = unlist(strsplit(sdw, "," ))) , by = list(GNO) ]
它发出警告
Warning message:
In `[.data.table`(G, , list(sd = unlist(strsplit(sd, ",")), sdw = unlist(strsplit(sdw, :
Column 1 of result for group 4 is length 3 but the longest column in this result is 4. Recycled leaving remainder of 1 items. This warning is once only for the first group with this issue.
我正在寻找的输出如下。
out <- structure(list(GNO = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
5, 5, 5, 6, 6, 6, 7, 7, 7), sd = structure(c(1L, 14L, 9L, 6L,
2L, 5L, 10L, 4L, 1L, 16L, 6L, 1L, 12L, 8L, 7L, 10L, 13L, 8L,
3L, 15L, 17L, 11L), .Label = c("SD1", "SD12", "SD183", "SD23",
"SD315", "SD32", "SD340", "SD345", "SD384", "SD387", "SD39",
"SD40", "SD42", "SD44", "SD455", "SD567", "SD86"), class = "factor"),
sdw = structure(c(2L, 17L, 11L, 7L, 3L, 6L, 13L, 5L, 2L,
19L, 7L, 1L, 15L, 10L, 8L, 12L, 16L, 9L, 4L, 18L, 20L, 14L
), .Label = c("SD1:FIELD", "SD1:LUSH", "SD12:PAINTING", "SD183:WIRE",
"SD23:SWITCH", "SD315:MONEY31", "SD32:TRUMPET", "SD340:DUST",
"SD345:FOX", "SD345:HOLE", "SD384:FR563", "SD387:ROLL", "SD387:SPRING",
"SD39:MATURE", "SD40:GREEN, SD40:PARK", "SD42:CRAYON", "SD44:CANCEL",
"SD455:TOMATO", "SD567:TREK", "SD86:COSMIC, SD86:RAY"), class = "factor")), .Names = c("GNO",
"sd", "sdw"), row.names = c(NA, -22L), class = "data.frame")
out
GNO sd sdw
1 1 SD1 SD1:LUSH
2 1 SD44 SD44:CANCEL
3 1 SD384 SD384:FR563
4 1 SD32 SD32:TRUMPET
5 2 SD12 SD12:PAINTING
6 2 SD315 SD315:MONEY31
7 2 SD387 SD387:SPRING
8 3 SD23 SD23:SWITCH
9 3 SD1 SD1:LUSH
10 3 SD567 SD567:TREK
11 4 SD32 SD32:TRUMPET
12 4 SD1 SD1:FIELD
13 4 SD40 SD40:GREEN, SD40:PARK
14 5 SD345 SD345:HOLE
15 5 SD340 SD340:DUST
16 5 SD387 SD387:ROLL
17 6 SD42 SD42:CRAYON
18 6 SD345 SD345:FOX
19 6 SD183 SD183:WIRE
20 7 SD455 SD455:TOMATO
21 7 SD86 SD86:COSMIC, SD86:RAY
22 7 SD39 SD39:MATURE
如何获得此输出?
答案 0 :(得分:6)
以下是我如何仅使用data.table
包来解决此问题(无需创建G2
)
此处我们首先按swd
进行拆分,然后按GNO
列:
列中stw
之前的部分进行汇总
G[, .(unlist(strsplit(sdw, ", ", TRUE))), GNO][,
.(stw = toString(V1)), .(GNO, sd = gsub(":.*", "", V1))]
# GNO sd stw
# 1: 1 SD1 SD1:LUSH
# 2: 1 SD44 SD44:CANCEL
# 3: 1 SD384 SD384:FR563
# 4: 1 SD32 SD32:TRUMPET
# 5: 2 SD12 SD12:PAINTING
# 6: 2 SD315 SD315:MONEY31
# 7: 2 SD387 SD387:SPRING
# 8: 3 SD23 SD23:SWITCH
# 9: 3 SD1 SD1:LUSH
# 10: 3 SD567 SD567:TREK
# 11: 4 SD32 SD32:TRUMPET
# 12: 4 SD1 SD1:FIELD
# 13: 4 SD40 SD40:GREEN, SD40:PARK
# 14: 5 SD345 SD345:HOLE
# 15: 5 SD340 SD340:DUST
# 16: 5 SD387 SD387:ROLL
# 17: 6 SD42 SD42:CRAYON
# 18: 6 SD345 SD345:FOX
# 19: 6 SD183 SD183:WIRE
# 20: 7 SD455 SD455:TOMATO
# 21: 7 SD86 SD86:RAY, SD86:COSMIC
# 22: 7 SD39 SD39:MATURE
答案 1 :(得分:2)
这可能对您数据的性质做出了太大的假设,但似乎&#34; sd&#34;列实际上有点无关紧要,因为该信息已经嵌入&#34; sdw&#34;列。
因此,这就是我提出的,使用我的&#34; splitstackshape&#34;中的两个cSplit
s。包:
library(splitstackshape)
temp <- cSplit(
cSplit(
G, "sdw", ",", "long"),
"sdw", ":")[, c(1, 3, 4), with = FALSE
][, list(paste(unique(sdw_1), unique(sdw_2),
sep = ":", collapse = ", ")),
by = list(GNO, sdw_1)]
setnames(temp, c("GNO", "sd", "sdw"))
temp
# GNO sd sdw
# 1: 1 SD1 SD1:LUSH
# 2: 1 SD44 SD44:CANCEL
# 3: 1 SD384 SD384:FR563
# 4: 1 SD32 SD32:TRUMPET
# 5: 2 SD12 SD12:PAINTING
# 6: 2 SD315 SD315:MONEY31
# 7: 2 SD387 SD387:SPRING
# 8: 3 SD23 SD23:SWITCH
# 9: 3 SD1 SD1:LUSH
# 10: 3 SD567 SD567:TREK
# 11: 4 SD32 SD32:TRUMPET
# 12: 4 SD1 SD1:FIELD
# 13: 4 SD40 SD40:GREEN, SD40:PARK
# 14: 5 SD345 SD345:HOLE
# 15: 5 SD340 SD340:DUST
# 16: 5 SD387 SD387:ROLL
# 17: 6 SD42 SD42:CRAYON
# 18: 6 SD345 SD345:FOX
# 19: 6 SD183 SD183:WIRE
# 20: 7 SD455 SD455:TOMATO
# 21: 7 SD86 SD86:RAY, SD86:COSMIC
# 22: 7 SD39 SD39:MATURE
# GNO sd sdw
library(compare)
compare(out, temp, allowAll = TRUE)
# TRUE
# [GNO] coerced from <integer> to <numeric>
# [sdw] coerced from <character> to <factor>
# dropped attributes
# [3] dropped attributes