我的数据集如下所示:
library(data.table)
# Example input: `a` identifies the row; `b`, `c` and `d` hold the values
# that are checked for being non-zero.
df <- data.table(
  a = c(1, 2, 3, 4, 5),
  b = c(1, 0, 2, 5, 1),
  c = c(0, 1, 1, 0, 0),
  d = c(1, 0, 0, 2, 2)
)
df
#    a b c d
# 1: 1 1 0 1
# 2: 2 0 1 0
# 3: 3 2 1 0
# 4: 4 5 0 2
# 5: 5 1 0 2
我想创建一个新列,其内容是每一行中取值非零的列的名称(用下划线连接)。期望的结果是:
# Desired output: `z` lists, per row, the names of the non-zero columns
# joined by "_".
df_result <- data.table(
  a = c(1, 2, 3, 4, 5),
  z = c("b_d", "c", "b_c", "b_d", "b_d")
)
df_result
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d
答案 0 :(得分:12)
假设 nrow >> ncol,您可以按列进行处理:
# Build, for every row of `x`, a "_"-separated string with the names of the
# columns whose value in that row is non-zero.
#
# The work is done column by column, which is cheap when nrow(x) >> ncol(x).
#
# x: a data.frame / data.table holding only the columns to inspect.
# Returns a character vector of length nrow(x); a row in which every value
# is zero or NA yields "".
ff <- function(x) {
  ans <- character(nrow(x))
  nms <- names(x)
  for (j in seq_along(x)) {
    # which() drops NA, so missing values count as "empty" instead of
    # aborting the subscripted assignment below. `!= 0` (rather than `> 0`)
    # also picks up negative values.
    i <- which(x[[j]] != 0)
    if (length(i) == 0L) {
      next
    }
    # Insert the separator only where the row already holds a name, so no
    # leading "_" has to be stripped afterwards (which would also damage
    # column names that really start with "_").
    sep <- ifelse(nzchar(ans[i]), "_", "")
    ans[i] <- paste0(ans[i], sep, nms[[j]])
  }
  ans
}
# Label every row using all columns except the first one (the id column `a`).
# Equivalent spelling: `ff(df[, -1L, with = FALSE])`.
df[, ff(.SD), .SDcols = -1L] # form suggested by David Arenburg
# [1] "b_d" "c"   "b_c" "b_d" "b_d"
答案 1 :(得分:8)
一种选择是使用 melt 将数据从“宽”格式转换为“长”格式。按 'a' 分组后,我们用 paste 把 'variable' 中与 'value' 里非零元素相对应的元素拼接起来(非零条件在 'i' 中作为逻辑条件给出)。
# Reshape to long format (one row per a/variable pair), keep only the
# non-zero values, then glue the remaining variable names together per `a`.
melt(df, id.vars = "a")[
  value != 0,
  list(z = paste(variable, collapse = "_")),
  keyby = "a"
]
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d
或者不使用 melt:我们可以按 'a' 分组,对数据子集(.SD)执行 unlist,然后用 paste 拼接与非零元素('i1')对应的列的 names。
# Per group of `a`: flatten the remaining columns of that group and keep the
# names of those whose value is non-zero.
df[, {
  i1 <- unlist(.SD) != 0
  paste(names(.SD)[i1], collapse = "_")
}, by = "a"]
# Benchmark data: one million rows, id column `a` plus three randomly filled
# integer columns. The seed is fixed so the data is reproducible.
set.seed(24)
df1 <- data.table(
  a = 1:1e6,
  b = sample(0:5, 1e6, replace = TRUE),
  c = sample(0:4, 1e6, replace = TRUE),
  d = sample(0:3, 1e6, replace = TRUE)
)
# Benchmark candidate: melt `df1` to long format, drop the zero values and
# paste the surviving variable names per `a`.
akrun1 <- function() {
  melt(df1, id.vars = "a")[
    value != 0,
    list(z = paste(variable, collapse = "_")),
    keyby = "a"
  ]
}
# Benchmark candidate: for each `a` in `df1`, flatten the other columns and
# paste the names of the non-zero ones.
akrun2 <- function() {
  df1[, {
    nonzero <- unlist(.SD) != 0
    paste(names(.SD)[nonzero], collapse = "_")
  }, by = "a"]
}
# Benchmark candidate: row-wise approach on `df1`.
# Returns a one-column data.table whose `z` column is the list produced by
# lapply().
ronak <- function() {
# Inner apply(): for every row, drop the first element (column `a`) and keep
# the named positions of the non-zero values.
# Outer lapply(): paste the names of those positions together with "_".
# NOTE(review): apply() simplifies its result to a vector/matrix when every
# row yields the same number of hits; names(x) would then no longer be the
# column names. Confirm this cannot happen for the benchmarked data.
data.table(z = lapply(apply(df1, 1, function(x)
which(x[-1]!= 0)),
function(x) paste0(names(x), collapse = "_")))
}
# Benchmark candidate: vectorised paste over the columns b..d of `df1`.
# NOTE(review): `:=` adds `newcol` to `df1` by reference, so `df1` has an
# extra column after the first call; run this after the other candidates.
eddi <- function(){
# gsub() strips the "NA" placeholders (and their separators) left by paste().
# NOTE(review): the pattern would also remove "NA" inside a real column name.
df1[, newcol := gsub("NA_|_NA|NA", "",
# Paste the per-column pieces together row by row, separated by "_".
do.call(function(...) paste(..., sep = "_"),
# x is a single column name: index 1 keeps it (value != 0), index 2 is out of
# range and yields NA (value == 0).
Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD)))
, .SDcols = b:d]
}
# Benchmark candidate: column-wise construction of the label vector.
#
# For every row of `x`, returns a "_"-separated string with the names of the
# columns whose value in that row is non-zero.
#
# x: a data.frame / data.table holding only the columns to inspect.
# Returns a character vector of length nrow(x); a row in which every value
# is zero or NA yields "".
alexis <- function(x) {
  ans <- character(nrow(x))
  nms <- names(x)
  for (j in seq_along(x)) {
    # which() drops NA, so missing values count as "empty" instead of
    # aborting the subscripted assignment below. `!= 0` (rather than `> 0`)
    # matches the non-zero test used by the other candidates.
    i <- which(x[[j]] != 0)
    if (length(i) == 0L) {
      next
    }
    # Insert the separator only where the row already holds a name, so no
    # leading "_" has to be stripped afterwards.
    sep <- ifelse(nzchar(ans[i]), "_", "")
    ans[i] <- paste0(ans[i], sep, nms[[j]])
  }
  ans
}
# Timings of the candidates on `df1`; the comment under each call is the
# recorded system.time() output (seconds).
# eddi() is timed last because it adds a column to `df1` by reference.
system.time(akrun1())
# user system elapsed
# 22.04 0.15 22.36
system.time(akrun2())
# user system elapsed
# 26.33 0.00 26.41
system.time(ronak())
# user system elapsed
# 25.60 0.26 25.96
# alexis() receives `df1` without its first column (`a`).
system.time(alexis(df1[, -1L, with = FALSE]))
# user system elapsed
# 1.92 0.06 2.09
system.time(eddi())
# user system elapsed
# 2.41 0.06 3.19
答案 2 :(得分:8)
这是一个直接的方法:
# Add `newcol`: for every row, the names of the columns among b..d whose
# value is non-zero, joined by "_" ("" when there is none). NA values count
# as empty. `:=` adds the column to `df` by reference.
df[, newcol := {
  # One character vector per inspected column: the column's name where the
  # value is non-zero, "" elsewhere.
  parts <- Map(
    function(nm, vals) ifelse(!is.na(vals) & vals != 0, nm, ""),
    names(.SD), .SD
  )
  # Join the pieces left to right, inserting "_" only between two non-empty
  # pieces. Unlike pasting NA placeholders and deleting them with a regex,
  # this cannot eat an "NA" that is part of a real column name (e.g. "NAME").
  Reduce(
    function(acc, p) {
      ifelse(nzchar(acc) & nzchar(p), paste(acc, p, sep = "_"), paste0(acc, p))
    },
    parts
  )
}, .SDcols = b:d]
#    a b c d newcol
# 1: 1 1 0 1    b_d
# 2: 2 0 1 0      c
# 3: 3 2 1 0    b_c
# 4: 4 5 0 2    b_d
# 5: 5 1 0 2    b_d
在akrun的测试数据上,它的速度提高了10倍。
答案 3 :(得分:4)
这种写法可能有点冗长。
对于每一行,先找出值不为 0 的列,然后将这些列的名称粘贴在一起。
# Row-wise approach: for every row, collect the names of the non-zero columns
# (ignoring the id column `a`) and join them with "_".
data.table(
  a = df$a,
  z = apply(
    # Only the value columns, as a matrix; each row slice handed to the
    # function below carries the column names.
    as.matrix(df[, -1L, with = FALSE]),
    1L,
    # Exactly one string per row, so the shape of apply()'s result does not
    # depend on how many columns are non-zero in each row, and `z` is a
    # plain character column. which() keeps NA values out of the index.
    function(x) paste0(names(x)[which(x != 0)], collapse = "_")
  )
)
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d