我想在 R 中处理一个表格。其中两个单元格存储以逗号分隔的坐标(开始和结束)。我想分割这些坐标,使它们在各自的行上,但保留行中其他单元格的信息。
示例表:
header1 header2 start end
data1 data2 1,100,200 99,199,299
期望的输出:
data1 data2 1 99
data1 data2 100 199
data1 data2 200 299
如何在 R 中执行此操作?
答案 0 :(得分:2)
我假设您的表名为 df。使用 data.table 包可以让这种重塑变得非常简单……
# Load data.table. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets the next line fail obscurely.
library(data.table)
dt <- as.data.table(df)
# For every (header1, header2) group, split the comma-separated start/end
# strings and emit one output row per coordinate pair.
# - as.character() keeps this working when the columns are factors.
# - unlist() keeps the pieces of every row in a group, not just the first one.
dt[, list(start = unlist(strsplit(as.character(start), ",", fixed = TRUE)),
          end = unlist(strsplit(as.character(end), ",", fixed = TRUE))),
   by = c("header1", "header2")]
# header1 header2 start end
#1: data1 data2 1 99
#2: data1 data2 100 199
#3: data1 data2 200 299
答案 1 :(得分:1)
我非常喜欢 Simon 的 data.table 方法的优雅。下面是一个老派的基础 R 版本:
# Original data: one row whose start/end cells hold comma-separated coordinates
dat <- data.frame(header1 = "data1", header2 = "data2",
                  start = "1,100,200", end = "99,199,299")
dat
## header1 header2 start end
## 1 data1 data2 1,100,200 99,199,299

# Split each cell once. as.character() guards against factor columns,
# which strsplit() refuses.
starts <- strsplit(as.character(dat$start), ",", fixed = TRUE)
ends <- strsplit(as.character(dat$end), ",", fixed = TRUE)
# Every row must yield as many start pieces as end pieces, otherwise the
# pairs below would be misaligned.
stopifnot(identical(lengths(starts), lengths(ends)))
# Repeat each source row once per coordinate pair, then attach the pieces.
# Unlike cbind-ing the split result, this also works for multi-row tables.
dat <- data.frame(dat[rep(seq_len(nrow(dat)), lengths(starts)), c(1, 2)],
                  start = unlist(starts),
                  end = unlist(ends))
# Drop the "1", "1.1", ... row names produced by the repeated row indexing
rownames(dat) <- NULL
dat
## header1 header2 start end
## 1 data1 data2 1 99
## 2 data1 data2 100 199
## 3 data1 data2 200 299
答案 2 :(得分:0)
我实际上会写一个看起来像这样的函数:
#' Split delimited cells into one row per piece
#'
#' Every column named in `splitCols` is split on `sep`; each input row is
#' repeated once per piece, and the remaining columns are carried along
#' unchanged.
#'
#' @param indf A data frame.
#' @param splitCols Character vector naming the columns to split. Within a
#'   row, all of these columns must contain the same number of pieces.
#' @param sep Literal (non-regex) separator string.
#' @return A data frame holding the non-split columns (in their original
#'   order) followed by the split columns as character vectors, with
#'   automatic row names.
NewSplit <- function(indf, splitCols, sep = ",") {
  stopifnot(is.data.frame(indf), is.character(splitCols),
            length(splitCols) > 0L, all(splitCols %in% names(indf)))
  Keys <- setdiff(names(indf), splitCols)
  # strsplit() rejects factors and numbers, so coerce before splitting
  pieces <- lapply(indf[splitCols], function(x) {
    strsplit(as.character(x), split = sep, fixed = TRUE)
  })
  # Number of pieces per input row, taken from the first split column
  Rep <- unname(lengths(pieces[[1L]]))
  # All split columns must agree row by row; otherwise values from different
  # input rows would be paired up silently.
  for (col in splitCols[-1L]) {
    if (!identical(unname(lengths(pieces[[col]])), Rep)) {
      stop("Column '", col, "' does not have the same number of pieces per ",
           "row as '", splitCols[1L], "'", call. = FALSE)
    }
  }
  # drop = FALSE keeps a data frame (and its column name) even when only one
  # key column remains; integer indices avoid a row-name lookup.
  cbind(indf[rep(seq_len(nrow(indf)), Rep), Keys, drop = FALSE],
        # as.character() turns unlist()'s NULL into character(0) for
        # zero-row input so the split columns survive
        lapply(pieces, function(p) as.character(unlist(p, use.names = FALSE))),
        row.names = NULL,
        stringsAsFactors = FALSE)
}
可以像这样使用:
# Expand both coordinate columns of `dat`, one output row per start/end pair
NewSplit(indf = dat, splitCols = c("start", "end"), sep = ",")
# header1 header2 id start end
# 1 A F 1 1 99
# 2 A F 1 100 199
# 3 A F 1 200 299
# 4 B G 1 11 33
# 5 B G 1 222 444
# 6 C H 1 10 72
# 7 D I 1 7 10
# 8 D I 1 8 9
# 9 D I 1 9 8
# 10 D I 1 10 7
# 11 D I 1 11 6
# 12 E J 1 1 3
其中 "dat" 定义为:
# Five-row demo table: two key columns plus comma-separated start/end strings
dat <- data.frame(
  header1 = LETTERS[seq(1, 5)],
  header2 = LETTERS[seq(6, 10)],
  start = c("1,100,200", "11,222", "10", "7,8,9,10,11", "1"),
  end = c("99,199,299", "33,444", "72", "10,9,8,7,6", "3")
)
# Running counter within each header1/header2 combination (stored as double,
# because ave() writes the counts back into the rep(1, ...) vector)
dat[["id"]] <- ave(
  rep(1, nrow(dat)),
  dat[["header1"]], dat[["header2"]],
  FUN = seq_along
)
这实际上是一个非常快的函数,因为它所使用的基础函数都非常快。下面是在 5 万行数据上与 "data.table" 答案的比较。
# Stack 10,000 copies of `dat` into one 50,000-row benchmark table
dat2 <- do.call(rbind, rep(list(dat), times = 10000))
# Recount id within each header1/header2 pair so that every
# (header1, header2, id) combination identifies exactly one row
dat2[["id"]] <- ave(
  rep(1, nrow(dat2)),
  dat2[["header1"]], dat2[["header2"]],
  FUN = seq_along
)
dim(dat2)
# [1] 50000 5
dt <- as.data.table(dat2)
# data.table contender for the benchmark: splits start/end within each
# (header1, header2, id) group and returns one row per coordinate pair.
#
# `dt` has no default on purpose. The previous `dt = dt` default referred to
# the argument itself, so calling fun1() without an argument failed with
# "promise already under evaluation". Pass the table explicitly.
fun1 <- function(dt) {
  dt[, list(
    # [[1]] keeps the pieces of the first row of each group only; this relies
    # on every (header1, header2, id) group holding exactly one row.
    start = strsplit(as.character(start), ",", fixed = TRUE)[[1]],
    end = strsplit(as.character(end), ",", fixed = TRUE)[[1]]),
    by = list(header1, header2, id)]
}
# Base-R contender for the benchmark: delegates to NewSplit() on the
# start/end columns. Defaults to the benchmark table `dat2`.
fun2 <- function(df = dat2) {
  split_cols <- c("start", "end")
  NewSplit(indf = df, splitCols = split_cols, sep = ",")
}
# Check that both implementations produce the same table
all.equal(target = as.data.frame(fun1(dt)), current = fun2(dat2))
# [1] TRUE
# Time the data.table version
system.time(expr = fun1(dt))
# user system elapsed
# 1.953 0.009 1.999
# Time the base-R NewSplit() version
system.time(expr = fun2(dat2))
# user system elapsed
# 0.286 0.001 0.288