使用非空列的名称创建新列

时间:2016-03-30 09:13:56

标签: r data.table

我的数据集如下所示:

library(data.table)

# Example input: five rows, four numeric columns.
df <- data.table(
  a = c(1, 2, 3, 4, 5),
  b = c(1, 0, 2, 5, 1),
  c = c(0, 1, 1, 0, 0),
  d = c(1, 0, 0, 2, 2)
)

print(df)
#    a b c d
# 1: 1 1 0 1
# 2: 2 0 1 0
# 3: 3 2 1 0
# 4: 4 5 0 2
# 5: 5 1 0 2

我想创建一个新列,其内容是每一行中非空(值不为 0)的列的名称。期望的结果如下:

# Desired output: `z` holds, per row, the names of the non-zero columns
# among b, c and d, joined with "_".
df_result <- data.table(
  a = c(1, 2, 3, 4, 5),
  z = c("b_d", "c", "b_c", "b_d", "b_d")
)

print(df_result)
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d

4 个答案:

答案 0 :(得分:12)

假设行数远大于列数(nrow >> ncol),可以逐列进行处理:

# Build, for every row of `x`, a string with the names of the columns whose
# value in that row is non-zero, joined with "_".
#
# Works column by column, so the loop runs ncol(x) times and each step is
# vectorised over the rows.
#
# x: a data.frame / data.table whose columns are all to be inspected.
# Returns a character vector of length nrow(x); rows with no non-zero
# (or only NA) values yield "".
ff <- function(x) {
  ans <- character(nrow(x))
  col_names <- names(x)
  for (j in seq_along(x)) {
    # which() drops NA comparisons; a logical mask containing NA would make
    # the subscripted assignment below fail. `!= 0` (rather than `> 0`) also
    # counts negative values as non-empty.
    hit <- which(x[[j]] != 0)
    # Only insert the separator when something was already collected, so no
    # leading "_" has to be stripped afterwards.
    sep <- ifelse(nzchar(ans[hit]), "_", "")
    ans[hit] <- paste0(ans[hit], sep, col_names[[j]])
  }
  ans
}
# Run ff on every column except the first one.
# Alternative from David Arenburg: `df[, ff(.SD), .SDcols = -1L]`
ff(df[, .SD, .SDcols = -1L])
#[1] "b_d" "c"   "b_c" "b_d" "b_d"

答案 1 :(得分:8)

一种选择是使用 melt 将数据从“宽”格式转换为“长”格式。按 'a' 分组后,我们 paste 'variable' 中与 'value' 的非零元素相对应的元素(非零条件作为逻辑条件在 'i' 中提供)。

# Reshape to long format (one row per `a` / column pair), keep only the rows
# whose value is non-zero (filter in `i`), then join the remaining column
# names per `a`. `keyby` also sorts the result by `a`.
# `id.vars` is written out in full instead of relying on partial matching of
# `id.var`.
# Note: because the filter runs before grouping, an `a` whose columns are all
# zero does not appear in the result at all.
melt(df, id.vars = "a")[
  value != 0,
  .(z = paste(variable, collapse = "_")),
  keyby = a
]
#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d

或者不使用 melt:按 'a' 分组,对数据子集(.SD)执行 unlist,然后 paste 与非零元素('i1')相对应的列的 names。

# Per `a`: flatten the group's remaining columns (.SD) into one vector, turn
# it into a non-zero mask with `!!`, and join the matching column names.
df[, paste(names(.SD)[!!unlist(.SD)], collapse = "_"), by = a]

基准测试

# Benchmark data: one million rows with small random integer values
# (zeros included) in b, c and d. The seed makes the sample reproducible.
set.seed(24)
df1 <- data.table(
  a = 1:1e6,
  b = sample(0:5, 1e6, replace = TRUE),
  c = sample(0:4, 1e6, replace = TRUE),
  d = sample(0:3, 1e6, replace = TRUE)
)

# Benchmark candidate: melt df1 to long format, keep the non-zero values and
# paste the column names per `a`.
# `id.vars` is written out in full instead of relying on partial matching of
# `id.var`.
# Note: the `value != 0` filter runs before grouping, so an `a` whose columns
# are all zero is absent from the result.
akrun1 <- function() {
  long <- melt(df1, id.vars = "a")
  long[value != 0, .(z = paste(variable, collapse = "_")), keyby = a]
}

 # Benchmark candidate: per `a`, flatten .SD, build a non-zero mask with `!!`
 # and paste the names of the matching columns.
 akrun2 <- function() {
   df1[, paste(names(.SD)[!!unlist(.SD)], collapse = "_"), by = a]
 }

 # Benchmark candidate: row-wise scan. For every row of df1 (first column
 # excluded) paste the names of the non-zero columns.
 #
 # The callback always returns a single string, so apply() yields a character
 # vector. Returning which() directly would let apply() simplify to a matrix
 # or vector whenever every row has the same number of non-zero columns, and
 # the column names would be lost.
 ronak <- function() {
   nonzero <- as.matrix(df1[, -1L, with = FALSE]) != 0
   data.table(z = apply(nonzero, 1, function(row) {
     # which() ignores NA entries of the mask.
     paste(names(row)[which(row)], collapse = "_")
   }))
 }

# Benchmark candidate: vectorised over rows, one pass per column.
# Adds `newcol` to df1 by reference (`:=`).
#
# Each column is turned into a vector holding the column name where the value
# is non-zero and "" elsewhere; the vectors are then joined left to right,
# inserting "_" only between two non-empty pieces. No "NA" placeholder text is
# produced, so column names that contain "NA" are left intact.
eddi <- function() {
  df1[, newcol := {
    col_labels <- Map(function(x, y) {
      lab <- character(length(y))
      lab[which(y != 0)] <- x  # which() skips NA values
      lab
    }, names(.SD), .SD)
    Reduce(function(acc, lab) {
      paste0(acc, ifelse(nzchar(acc) & nzchar(lab), "_", ""), lab)
    }, col_labels)
  }, .SDcols = b:d]
}

# Benchmark candidate: column-wise accumulation.
# For every row of `x`, returns the names of the columns whose value is
# non-zero, joined with "_" ("" when there are none).
#
# x: a data.frame / data.table whose columns are all to be inspected.
# Returns a character vector of length nrow(x).
alexis <- function(x) {
  ans <- character(nrow(x))
  col_names <- names(x)
  for (j in seq_along(x)) {
    # which() drops NA comparisons; a logical mask containing NA would make
    # the subscripted assignment below fail. `!= 0` (rather than `> 0`) also
    # counts negative values as non-empty.
    hit <- which(x[[j]] != 0)
    # Only insert the separator when something was already collected, so no
    # leading "_" has to be stripped afterwards.
    sep <- ifelse(nzchar(ans[hit]), "_", "")
    ans[hit] <- paste0(ans[hit], sep, col_names[[j]])
  }
  ans
}





# Timings as originally reported on df1; absolute numbers are machine-specific.
# eddi() is timed last because it adds `newcol` to df1 by reference, which
# would change the input seen by the other candidates.
system.time(akrun1())
#   user  system elapsed
#  22.04    0.15   22.36

system.time(akrun2())
#   user  system elapsed
# 26.33    0.00   26.41

system.time(ronak())
#   user  system elapsed
#  25.60    0.26   25.96

system.time(alexis(df1[, -1L, with = FALSE]))
#   user  system elapsed
#   1.92    0.06    2.09

system.time(eddi())
#  user  system elapsed
#   2.41    0.06    3.19

答案 2 :(得分:8)

这是一个直接的方法:

# Add `newcol` to df by reference: per row, the names of the non-zero columns
# among b..d joined with "_".
#
# Each column becomes a vector holding its own name where the value is
# non-zero and "" elsewhere; the vectors are then joined left to right,
# inserting "_" only between two non-empty pieces. No "NA" placeholder text is
# produced, so column names that contain "NA" are left intact.
df[, newcol := {
  col_labels <- Map(function(x, y) {
    lab <- character(length(y))
    lab[which(y != 0)] <- x  # convert data to colnames; which() skips NA
    lab
  }, names(.SD), .SD)
  Reduce(function(acc, lab) {
    paste0(acc, ifelse(nzchar(acc) & nzchar(lab), "_", ""), lab)  # paste colnames together
  }, col_labels)
}, .SDcols = b:d]
#   a b c d newcol
#1: 1 1 0 1    b_d
#2: 2 0 1 0      c
#3: 3 2 1 0    b_c
#4: 4 5 0 2    b_d
#5: 5 1 0 2    b_d

在akrun的测试数据上,它的速度提高了10倍。

答案 3 :(得分:4)

这可能有点冗长。

对每一行,先找出值不为 0 的列,然后将这些列的名称粘贴在一起。

# Row-wise approach: for every row of df (first column excluded) paste the
# names of the non-zero columns.
#
# The callback always returns a single string, so apply() yields a character
# vector and `z` is a plain character column. Returning which() directly would
# let apply() simplify to a matrix or vector whenever every row has the same
# number of non-zero columns, and the column names would be lost.
data.table(
  a = df$a,
  z = apply(as.matrix(df[, -1L, with = FALSE]) != 0, 1, function(row) {
    # which() ignores NA entries of the mask.
    paste(names(row)[which(row)], collapse = "_")
  })
)


#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d