使用表格分组:参数

时间:2015-06-08 14:05:01

标签: r grouping

我最近问过一个问题:如何在R中获取某一列的内容,并将其用作新数据框的列标题;如果某个Id包含该值,则对应的布尔值为1,否则为0。

一个例子是

Id.   Event
A.    Wc
B.    Df
C.    Df
A.    Df

需要转换为

     Wc df
A   1.    1
B   0.     1
C.  0.    1

我已经玩弄了它,似乎工作正常但是最近我收到了以下错误

Error in FUN(X[[1L]], ...) : invalid 'type' (character) of argument

# For every awb_no group, count the distinct non-missing values of SMS.Type
# and Replied, and add the results by reference as SMS.Type.count and
# Replied.count.
cols <- c("SMS.Type", "Replied")
setDT(train)
train[
  ,
  (paste0(cols, ".count")) := lapply(
    .SD,
    function(values) length(unique(na.omit(values)))
  ),
  by = awb_no,
  .SDcols = cols
]


# Summarize a column and convert its values into 0/1 indicator columns.
# Take the SMS.Type column and collect the distinct values found in it;
# these become the indicator column names.
lst <- train$SMS.Type
lvl <- unique(unlist(lst))      
# For each element of lst, tabulate it against the full level set so every
# row has one count per level, then bind the rows next to ID_no.
train.agg.chkpt <- data.frame(ID_no=train$ID_no,
          do.call(rbind,lapply(lst, function(x) table(factor(x,levels=lvl)))), 
          stringsAsFactors=FALSE)

# Sum every column of train.agg.chkpt within each ID_no group.
# NOTE(review): the whole data frame is passed in, including ID_no itself,
# which str() reports as chr. Summing a character column looks like the
# source of "invalid 'type' (character) of argument"; aggregating only the
# count columns (e.g. train.agg.chkpt[-1]) would presumably avoid it - confirm.
train.agg.chkpt <- aggregate (train.agg.chkpt,by=list(ID_no=train.agg.chkpt$ID_no), FUN = "sum")
# Drop the first column of the aggregated result.
train.agg.chkpt <- train.agg.chkpt[c(-1)]

列ID_no只是一个ID号,这是布尔值被分组的ID。它是一个字符类型编号(我假设这是错误信息引用的内容)

每个ID都应该是唯一的。以下是数据集的结构

str(train.agg.chkpt)
'data.frame':   823462 obs. of  12 variables:
  $ ID_no  : chr  "AAAAAAA75465" "BBBBB175465" "CCCCCC75476" "DDDDD75476" ...
 $ WC      : int  1 0 0 1 0 0 0 1 0 1 ...
 $ DF1     : int  0 1 1 0 0 0 0 0 0 0 ...
 $ DF2     : int  0 0 0 0 1 1 1 0 1 0 ...
 $ WCB14   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ WCA13   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ HN      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ WCB13   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ WCA12   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ WCA14   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ WCB12   : int  0 0 0 0 0 0 0 0 0 0 ...

以下是traceback()

lapply(X = split(e, grp), FUN = FUN, ...)
4: FUN(X[[1L]], ...)
3: lapply(x, function(e) {
   ans <- lapply(X = split(e, grp), FUN = FUN, ...)
   if (simplify && length(len <- unique(sapply(ans, length))) == 
       1L) {
       if (len == 1L) {
           cl <- lapply(ans, oldClass)
           cl1 <- cl[[1L]]
           ans <- unlist(ans, recursive = FALSE)
           if (!is.null(cl1) && all(sapply(cl, function(x) identical(x, 
               cl1)))) 
               class(ans) <- cl1
       }
       else if (len > 1L) 
           ans <- matrix(unlist(ans, recursive = FALSE), nrow = nry, 
               ncol = len, byrow = TRUE, dimnames = {
                 if (!is.null(nms <- names(ans[[1L]]))) 
                   list(NULL, nms)
                 else NULL
               })
   }
   ans
   })
2: aggregate.data.frame(train.agg.chkpt, by = list(ID_no = train.agg.chkpt$ID_no), 
   FUN = "sum")
1: aggregate(train.agg.chkpt, by = list(ID_no = train.agg.chkpt$ID_no), 
   FUN = "sum")

任何人都可以帮我理解错误信息吗?

感谢您的时间

1 个答案:

答案 0 :(得分:4)

只需对每个Id调用一次table,就可以轻松得到所需的输出。下面是一个可能的data.table(您已经在使用它)实现

library(data.table)
# Convert df to a data.table, then for each Id tabulate Event and turn the
# counts into a list so that each Event value becomes its own column
# (see the printed result below).
setDT(df)[, as.list(table(Event)), by = Id]
#    Id Df Wc
# 1:  A  1  1
# 2:  B  1  0
# 3:  C  1  0

或者,(如建议的那样)您可以使用简单的dcast

# Cast to wide format: one row per Id, one column per Event value, each cell
# holding the number of matching rows. The aggregation argument is spelled
# out as fun.aggregate rather than relying on partial matching of `fun`.
dcast(setDT(df), Id ~ Event, fun.aggregate = length, value.var = "Event")
#    Id Df Wc
# 1:  A  1  1
# 2:  B  1  0
# 3:  C  1  0

或类似地

library(reshape2)
# Same wide cast on a plain data frame: count the rows for every Id/Event
# pair. fun.aggregate is named in full instead of the partially matched `fun`.
dcast(df, Id ~ Event, fun.aggregate = length, value.var = "Event")

或使用tidyr(参见下面的注意)

library(tidyr)
# Add a constant indicator column; spread() uses it as the cell value.
df$indx <- 1
# Turn each Event value into its own column filled from indx; Id/Event
# combinations that do not occur are filled with 0.
spread(df, Event, indx, fill = 0) 
#   Id Df Wc
# 1  A  1  1
# 2  B  1  0
# 3  C  1  0

或使用基础R中的reshape(参见下面的注意)

# Base R wide reshape: one row per Id and one indx.<Event> column per Event
# value. It uses the indx column created by `df$indx <- 1` in the tidyr
# example; combinations that do not occur come out as NA (see output below).
reshape(df, idvar = "Id", timevar = "Event", direction = "wide", v.names = "indx")
#   Id indx.Wc indx.Df
# 1  A       1       1
# 2  B      NA       1
# 3  C      NA       1
  • 注意:当同一个Id下相同的Event出现不止一次时,spread和reshape在这里都无法工作,因为它们没有fun.aggregate参数,所以不知道如何处理这种重复。

基准测试

library(microbenchmark)
set.seed(123)  # make the sampled benchmark data reproducible
n <- 1e7
# Random test data: Id is drawn from the 26 upper-case letters, Event from
# the 26 x 26 two-letter labels built by outer(LETTERS, letters, paste0).
df <- data.frame(Id = sample(LETTERS, n, replace  = TRUE),
                 Event = sample(outer(LETTERS, letters, paste0), n, replace = TRUE))
# Separate copy for the data.table candidates, which call setDT() on their
# input; df itself is passed only to the reshape2 candidate.
dt <- copy(df)

# Candidate 1: table() per Id, spread into columns with as.list().
DT1 <- function(x) setDT(x)[, as.list(table(Event)), by = Id]
# Candidate 2: data.table's dcast; fun.aggregate is spelled out instead of
# relying on partial matching of `fun`.
DT2 <- function(x) dcast.data.table(setDT(x), Id ~ Event, fun.aggregate = length, value.var = "Event")
# Candidate 3: reshape2's dcast, namespace-qualified so the call does not
# depend on which attached package currently masks `dcast`.
RESHAPE2 <- function(x) reshape2::dcast(x, Id ~ Event, fun.aggregate = length, value.var = "Event")

microbenchmark(DT1(dt), DT2(dt), RESHAPE2(df))
# Unit: milliseconds
#         expr       min        lq      mean    median        uq       max neval
#      DT1(dt)  965.5181  987.8140 1017.8237 1007.1197 1030.7272 1285.9206   100
#      DT2(dt)  406.7124  420.6203  446.8026  434.2489  455.4364  592.4333   100
# RESHAPE2(df) 2969.0057 3035.5817 3190.6514 3099.3221 3240.4642 4384.6316   100

enter image description here