我最近问过一个问题:如何在 R 中获取某一列的内容,并把这些内容用作新数据框的列标题,用布尔值(1 或 0)表示每个 ID 是否包含该值。
一个例子是
Id  Event
A   Wc
B   Df
C   Df
A   Df
需要转换为
Id  Wc  Df
A   1   1
B   0   1
C   0   1
我已经试用过下面的代码,之前似乎工作正常,但是最近我收到了以下错误:
Error in FUN(X[[1L]], ...) : invalid 'type' (character) of argument
# For every awb_no group, store the number of distinct non-missing values of
# SMS.Type and Replied in new columns SMS.Type.count and Replied.count.
cols <- c("SMS.Type", "Replied")
setDT(train)  # converts `train` to a data.table by reference
train[,
  paste0(cols, ".count") := lapply(.SD, function(v) length(unique(na.omit(v)))),
  by = awb_no,
  .SDcols = cols]
# Summarize a column and convert its values to indicator column headers.
# Every row of `train` becomes one row of counts over the distinct SMS.Type
# values found in the data (one output column per value).
lst <- train$SMS.Type
lvl <- unique(unlist(lst))
train.agg.chkpt <- data.frame(
  ID_no = train$ID_no,
  do.call(rbind, lapply(lst, function(x) table(factor(x, levels = lvl)))),
  stringsAsFactors = FALSE
)
# Collapse to one row per ID_no by summing the count columns.
# Only the count columns are passed to aggregate(): handing it the whole data
# frame makes sum() run on the character ID_no column as well, which fails with
# "invalid 'type' (character) of argument". The grouping column supplied via
# `by` already carries ID_no into the result, so no column has to be dropped
# afterwards.
train.agg.chkpt <- aggregate(
  train.agg.chkpt[-1],
  by = list(ID_no = train.agg.chkpt$ID_no),
  FUN = sum
)
ID_no 列只是一个 ID 号,布尔值就是按这个 ID 分组的。它是字符类型(我猜这正是错误信息所指的内容)。
每个ID都应该是唯一的。以下是数据集的结构
str(train.agg.chkpt)
'data.frame': 823462 obs. of 12 variables:
$ ID_no : chr "AAAAAAA75465" "BBBBB175465" "CCCCCC75476" "DDDDD75476" ...
$ WC : int 1 0 0 1 0 0 0 1 0 1 ...
$ DF1 : int 0 1 1 0 0 0 0 0 0 0 ...
$ DF2 : int 0 0 0 0 1 1 1 0 1 0 ...
$ WCB14 : int 0 0 0 0 0 0 0 0 0 0 ...
$ WCA13 : int 0 0 0 0 0 0 0 0 0 0 ...
$ HN : int 0 0 0 0 0 0 0 0 0 0 ...
$ WCB13 : int 0 0 0 0 0 0 0 0 0 0 ...
$ WCA12 : int 0 0 0 0 0 0 0 0 0 0 ...
$ WCA14 : int 0 0 0 0 0 0 0 0 0 0 ...
$ WCB12 : int 0 0 0 0 0 0 0 0 0 0 ...
以下是 traceback() 的输出:
5: lapply(X = split(e, grp), FUN = FUN, ...)
4: FUN(X[[1L]], ...)
3: lapply(x, function(e) {
ans <- lapply(X = split(e, grp), FUN = FUN, ...)
if (simplify && length(len <- unique(sapply(ans, length))) ==
1L) {
if (len == 1L) {
cl <- lapply(ans, oldClass)
cl1 <- cl[[1L]]
ans <- unlist(ans, recursive = FALSE)
if (!is.null(cl1) && all(sapply(cl, function(x) identical(x,
cl1))))
class(ans) <- cl1
}
else if (len > 1L)
ans <- matrix(unlist(ans, recursive = FALSE), nrow = nry,
ncol = len, byrow = TRUE, dimnames = {
if (!is.null(nms <- names(ans[[1L]])))
list(NULL, nms)
else NULL
})
}
ans
})
2: aggregate.data.frame(train.agg.chkpt, by = list(ID_no = train.agg.chkpt$ID_no),
FUN = "sum")
1: aggregate(train.agg.chkpt, by = list(ID_no = train.agg.chkpt$ID_no),
FUN = "sum")
任何人都可以帮我理解错误信息吗?
感谢您的时间
答案 0 :(得分:4)
只需对每个 Id 调用一次 table,就可以轻松得到所需的输出。下面是一种基于 data.table(您已经在使用)的实现:
library(data.table)
# Count the occurrences of every Event within each Id; as.list() turns the
# per-group table into one column per Event value.
setDT(df)  # converts `df` to a data.table by reference
df[, as.list(table(Event)), by = Id]
# Id Df Wc
# 1: A 1 1
# 2: B 1 0
# 3: C 1 0
或者,(如建议的那样)您可以使用简单的dcast
# Cast long -> wide with one column per Event; `length` counts the rows of each
# Id/Event combination. The aggregation argument is spelled out in full
# (`fun.aggregate`) instead of relying on partial matching of `fun`.
dcast(setDT(df), Id ~ Event, fun.aggregate = length, value.var = "Event")
# Id Df Wc
# 1: A 1 1
# 2: B 1 0
# 3: C 1 0
或类似地
library(reshape2)
# Same cast using reshape2 on a plain data.frame; `fun.aggregate` is written
# out in full instead of relying on partial matching of `fun`.
dcast(df, Id ~ Event, fun.aggregate = length, value.var = "Event")
或使用tidyr
(参见下面的注意 )
library(tidyr)
# Helper indicator column: every observed Id/Event row is marked with 1.
df$indx <- 1
# Spread the Event values into columns, taking the cell values from `indx`;
# Id/Event combinations that do not occur in df are filled with 0.
# NOTE(review): spread() is superseded by tidyr::pivot_wider() in current tidyr.
spread(df, Event, indx, fill = 0)
# Id Df Wc
# 1 A 1 1
# 2 B 1 0
# 3 C 1 0
或使用基础R中的reshape
(参见下面的注意 )
# Base R alternative: reshape long -> wide using the helper column `indx`
# (df$indx <- 1) created for the tidyr example. Unlike the fill = 0 variant,
# Id/Event combinations that do not occur in df come out as NA.
reshape(df, idvar = "Id", timevar = "Event", direction = "wide", v.names = "indx")
# Id indx.Wc indx.Df
# 1 A 1 1
# 2 B NA 1
# 3 C NA 1
注意:如果某个 Id 的同一个 Event 出现不止一次,spread 和 reshape 在这里将无法工作,因为它们没有 fun.aggregate 参数,不知道该如何处理重复项。

**基准测试**
library(microbenchmark)
# Benchmark setup: 1e7 rows with 26 possible Ids and 26 * 26 possible Events.
set.seed(123)
n <- 1e7
df <- data.frame(Id = sample(LETTERS, n, replace = TRUE),
                 Event = sample(outer(LETTERS, letters, paste0), n, replace = TRUE))
# Separate copy for the data.table variants, because setDT() converts its
# argument by reference.
dt <- copy(df)

# Candidate 1: table() per Id inside data.table.
DT1 <- function(x) setDT(x)[, as.list(table(Event)), by = Id]
# Candidate 2: data.table's dcast method; `fun.aggregate` is spelled out rather
# than relying on partial matching of `fun`.
DT2 <- function(x) {
  dcast.data.table(setDT(x), Id ~ Event, fun.aggregate = length,
                   value.var = "Event")
}
# Candidate 3: dcast on a plain data.frame (reshape2).
RESHAPE2 <- function(x) {
  dcast(x, Id ~ Event, fun.aggregate = length, value.var = "Event")
}

microbenchmark(DT1(dt), DT2(dt), RESHAPE2(df))
# Unit: milliseconds
# expr min lq mean median uq max neval
# DT1(dt) 965.5181 987.8140 1017.8237 1007.1197 1030.7272 1285.9206 100
# DT2(dt) 406.7124 420.6203 446.8026 434.2489 455.4364 592.4333 100
# RESHAPE2(df) 2969.0057 3035.5817 3190.6514 3099.3221 3240.4642 4384.6316 100