我有一个data.frame tr_id_cat由两列组成:id,category。
我想构建一个计数矩阵:
这是我目前的代码,但运行时间很长:
# Build the id-by-category count matrix.
#
# Inputs (defined by the caller):
#   tr_id_cat  - data.frame with columns `id` and `category`
#   id         - vector of the possible id values (dictionary); fixes row order
#   categories - vector of the possible category values; fixes column order
#
# Result: COUNT_ID_CATEGORY[r, c] is the number of rows of tr_id_cat whose id
# is id[r] and whose category is categories[c]; pairs that never occur stay 0.
nb_id <- length(id)
nb_categories <- length(categories)

# Translate every observed value into its position in the dictionaries.
# Values missing from a dictionary become NA and are dropped by table().
row_idx <- match(tr_id_cat$id, id)
col_idx <- match(tr_id_cat$category, categories)

# A single vectorised tabulation replaces the nested loops. Using explicit
# factor levels guarantees a full nb_id x nb_categories grid, including
# dictionary entries that have no rows at all.
pair_counts <- table(
  factor(row_idx, levels = seq_len(nb_id)),
  factor(col_idx, levels = seq_len(nb_categories))
)

COUNT_ID_CATEGORY <- array(0, dim = c(nb_id, nb_categories))
COUNT_ID_CATEGORY[] <- as.vector(pair_counts)
我想要做的小版本:
id, category
1, 1
1, 1
1, 1
1, 2
1, 2
2, 1
3, 1
将转换为计数矩阵:
COUNT_ID_CATEGORY[1,1] = 3 # first three lines
COUNT_ID_CATEGORY[1,2] = 2 # line 4 and 5
COUNT_ID_CATEGORY[2,1] = 1
COUNT_ID_CATEGORY[2,2] = 0
COUNT_ID_CATEGORY[3,1] = 1
COUNT_ID_CATEGORY[3,2] = 0
etc
答案 0 :(得分:2)
如果我没有理解错你的问题,你想得到类似下面的结果:
library(dplyr)
# Count the rows for every (id, category) pair. The result has one row per
# pair that occurs in the data, with the count in column `n`.
# `%.%` was removed from dplyr; `%>%` is the supported pipe.
tr_id_cat %>%
  count(id, category)
答案 1 :(得分:2)
使用 data.table 和 reshape2 的解决方案。
library(data.table)
library(reshape2)
# Size of the simulated data set
n <- 15e6

# Simulate the input: random ids and random categories, drawn in that order
sim_id <- ceiling(runif(n) * 300e3)
sim_category <- ceiling(runif(n) * 20)
tr_id_cat <- data.table(id = sim_id, category = sim_category)

# Key (sort) the table by id, then category
setkeyv(tr_id_cat, c("id", "category"))

# One row per (id, category) pair, with the number of rows in column N
tab <- tr_id_cat[, .N, keyby = .(id, category)]

# Spread the categories into columns; pairs that never occur are filled with 0
dcast.data.table(
  tab,
  id ~ category,
  value.var = "N",
  fill = 0L
)
我在这里添加了@Arun的解决方案(下面的f2),并附上了计时结果。从结果来看,Arun的解决方案大约要多花20%的时间。
library(data.table)
library(reshape2)
library(rbenchmark)
# Keyed variant: simulate n rows, key the table by (id, category), count the
# rows per pair with `keyby`, and return the wide id-by-category count table.
f1 <- function(n = 15e6) {
  sim_id <- ceiling(runif(n) * 300e3)
  sim_category <- ceiling(runif(n) * 20)
  tr_id_cat <- data.table(id = sim_id, category = sim_category)

  setkeyv(tr_id_cat, c("id", "category"))

  pair_counts <- tr_id_cat[, .N, keyby = .(id, category)]
  dcast.data.table(
    pair_counts,
    id ~ category,
    value.var = "N",
    fill = 0L
  )
}
# Unkeyed variant: simulate n rows, count the rows per (id, category) pair
# with a plain `by` (no key is set), and return the wide count table.
f2 <- function(n = 15e6) {
  sim_id <- ceiling(runif(n) * 300e3)
  sim_category <- ceiling(runif(n) * 20)
  tr_id_cat <- data.table(id = sim_id, category = sim_category)

  pair_counts <- tr_id_cat[, .N, by = .(id, category)]
  dcast.data.table(
    pair_counts,
    id ~ category,
    value.var = "N",
    fill = 0L
  )
}
# Time f1() against f2() and report the elapsed time and the time relative
# to the fastest run. `replications` is a vector of three 10s.
benchmark(
  f1(), f2(),
  replications = rep(10, 3),
  columns = c("test", "replications", "elapsed", "relative")
)
我的机器上的结果:
test replications elapsed relative
1 f1() 10 61.62 1.000
3 f1() 10 62.21 1.010
5 f1() 10 61.60 1.000
2 f2() 10 73.55 1.194
4 f2() 10 74.31 1.206
6 f2() 10 73.33 1.190
答案 2 :(得分:-1)
这么简单吗?
# Small example data: three distinct (category, id) pairs, each repeated
# three times.
d <- data.frame(
  category = rep(1:3, 3),
  id = rep(1:3, 3)
)

# Composite key for each (category, id) pair. The separator keeps distinct
# pairs distinct: without it, category 1 / id 12 and category 11 / id 2
# would both collapse to "112".
d$tr_id_cat <- with(d, paste(category, id, sep = "_"))

library(plyr)
# Number of rows for each composite key
ddply(d, .(tr_id_cat), summarise, n = length(tr_id_cat))
或者,不在 d 中创建 tr_id_cat 变量,直接按 category 和 id 计数:
# Count the rows of d for each (category, id) combination
plyr::count(d, vars = c("category", "id"))
或使用dplyr
library(dplyr)
# Number of rows for each composite key. The verbs are namespace-qualified
# because plyr is attached earlier in this answer and exports functions with
# the same names. `%.%` was removed from dplyr, so `%>%` is used, and `::`
# replaces `:::` since these functions are exported.
d %>%
  dplyr::group_by(tr_id_cat) %>%
  dplyr::summarise(n = dplyr::n())