Question

我有一个表格，其中字符串格式的元素数量不等

File1 A  B  C
File2 A  B  D
File3 E  F

我想转换成以下格式

        A B C D E F
File1   1 1 1 0 0 0 
File2   1 1 0 1 0 0
File3   0 0 0 0 1 1

我尝试使用reshape2来做，但没有成功。

示例数据：

# Sample input: one row per file, with the file name in V1 and up to three
# string values in V2..V4. Shorter rows are padded with "".
mydata <- data.frame(
  V1 = c("File1", "File2", "File3"),
  V2 = c("A", "A", "E"),
  V3 = c("B", "B", "F"),
  V4 = c("C", "D", ""),
  stringsAsFactors = FALSE
)

Answer 1

一种可能性：

library(reshape2)
# Work on the sample object `mydata`. Blank cells only pad the shorter rows,
# so recode them to NA first: table() drops NA by default, which keeps an
# empty-string category out of the result.
df2 <- melt(replace(mydata, mydata == "", NA), id.vars = "V1")
# Cross-tabulate file name against value to get the 0/1 presence table.
with(df2, table(V1, value))

#         value
# V1      A B C D E F
#   File1 1 1 1 0 0 0
#   File2 1 1 0 1 0 0
#   File3 0 0 0 0 1 1

Answer 2

一种相当高效的方法是使用我的 "splitstackshape" 包中（目前）未导出的 charMat 函数。由于该函数未导出，因此必须使用 ::: 来访问它。

library(splitstackshape)
# charMat() builds one output row per list element, so hand it one element
# per data row. Splitting the data frame itself with split.default() groups
# by column, which only looks right when rows and value columns both number 3.
vals <- as.matrix(mydata[-1])
cbind(mydata[1], splitstackshape:::charMat(split(vals, row(vals)), fill = 0))
#      V1 V1 A B C D E F
# 1 File1  0 1 1 1 0 0 0
# 2 File2  0 1 1 0 1 0 0
# 3 File3  1 0 0 0 0 1 1

在幕后，charMat利用矩阵索引来非常有效地处理所有内容。一步一步，这是charMat所做的。

# One list element per data row. (Splitting the data frame by column only
# appears to work on the 3 x 3 sample, because the short row index is
# silently recycled when the index matrix is built.)
vals <- as.matrix(mydata[-1])
X <- split(vals, row(vals))
len <- length(X)
# All values in row order, and the sorted set of distinct values.
vec <- unlist(X, use.names = FALSE)
lvl <- sort(unique(vec))
# Start from an all-zero matrix: one row per data row, one column per value.
out <- matrix(0L, nrow = len, ncol = length(lvl), dimnames = list(NULL, lvl))
# Row index of every value (repeated once per value in that row) and the
# column index of every value.
i.idx <- rep(seq.int(len), vapply(X, length, integer(1L)))
j.idx <- match(vec, lvl)
# A two-column index matrix addresses individual (row, column) cells.
out[cbind(i.idx, j.idx)] <- 1L
out
#        A B C D E F
# [1,] 0 1 1 1 0 0 0
# [2,] 0 1 1 0 1 0 0
# [3,] 1 0 0 0 0 1 1

这看起来有点冗长，但它实际上是一个非常快的操作，而使用 charMat 函数可以更方便、更快地完成 :-)

更新：基准

以下基准测试将 Henrik 的答案与我的 charMat 答案进行比较，并且还把 Henrik 的答案改写为使用 "data.table" 的版本，以获得更高的效率。

进行了两次测试。第一个是类似的数据集，有90K行，第二个是900K行。

以下是示例数据：

# Stack 30,000 copies of the sample data, then make the file names unique.
biggerdata <- do.call(rbind, rep(list(mydata), 30000))
biggerdata[["V1"]] <- make.unique(biggerdata[["V1"]])
dim(biggerdata)
# [1] 90000     4

# Stack the 90K-row set ten times for the larger benchmark.
evenBigger <- do.call(rbind, rep(list(biggerdata), 10))
evenBigger[["V1"]] <- make.unique(evenBigger[["V1"]])
dim(evenBigger)
# [1] 900000      4

以下是用于基准测试的函数：

# Presence matrix via splitstackshape's unexported charMat().
#
# indf: data frame whose first column is the row identifier and whose
#       remaining columns hold the string values.
# Returns the identifier column bound to the matrix produced by charMat()
# (absent cells filled with 0).
#
# NOTE(review): the benchmark timings recorded in this file presumably
# predate this row-wise split; re-run them before relying on the numbers.
fun1 <- function(indf) {
  # charMat() needs one list element per data row. split.default() on the
  # data frame would group by column instead.
  vals <- as.matrix(indf[-1])
  cbind(indf[1], splitstackshape:::charMat(split(vals, row(vals)), fill = 0))
}

library(reshape2)
# Melt to long form with V1 as the identifier, then cross-tabulate V1
# against the melted values.
fun2 <- function(indf) {
  long <- melt(indf, id.vars = "V1")
  table(V1 = long$V1, value = long$value)
}

library(data.table)
library(reshape2)
# data.table versions of the two benchmark data sets; these are the inputs
# passed to fun3() in the benchmark calls.
DT <- data.table(biggerdata)
DT2 <- data.table(evenBigger)

# Melt the input with V1 as the identifier, then cast it wide on `value`,
# using length() to count the rows behind each V1/value cell.
fun3 <- function(inDT) {
  long_dt <- melt(inDT, id.vars = "V1")
  dcast.data.table(long_dt, V1 ~ value, fun.aggregate = length)
}

基准测试的结果。

library(microbenchmark)
# fun3() is wrapped in suppressMessages() so that the call matches the
# expression label recorded in the results below.
microbenchmark(
  fun1(biggerdata),
  fun2(biggerdata),
  suppressMessages(fun3(DT)),
  times = 20
)
# Unit: milliseconds
#                        expr       min        lq    median        uq       max neval
#            fun1(biggerdata)  185.3652  199.8725  289.0206  308.5826  327.4185    20
#            fun2(biggerdata) 1453.8791 1605.6053 1639.8567 1758.3984 1797.2229    20
#  suppressMessages(fun3(DT))  469.8979  570.4664  586.4715  598.6229  675.2961    20

# Same comparison on the 900K-row data, with only 5 repetitions per expression.
microbenchmark(fun1(evenBigger), fun2(evenBigger), fun3(DT2), times = 5)
# Unit: seconds
#              expr       min        lq    median        uq       max neval
#  fun1(evenBigger)  1.871611  1.896351  2.071355  2.140580  2.464569     5
#  fun2(evenBigger) 26.911523 27.212910 27.363442 27.469812 27.938178     5
#         fun3(DT2)  7.103615  7.131603  7.141908  7.205006  7.218321     5

将数据框转换为存在/缺失（presence-absence）矩阵

2 个答案:

更新：基准