我在R中有一个数据帧列表,每个数据帧都是从包含排名的不同文件加载的。例如,文件可以包含不同种族的不同运动员的完成位置。
相同的元素(运动员)可以出现在多个数据框(种族)中,但没有数据框必然包含所有元素。
我想在运动员的行和比赛中填充排名矩阵。如果在特定比赛中没有运动员的排名,则应该为0。
例如,如果我有:
[[1]]
name rank
1 Alice 1
2 Bob 2
3 Carla 3
4 Diego 4
[[2]]
name rank
1 Alice 2
2 Carla 1
3 Eric 3
4 Frank 4
5 Gary 5
[[3]]
name rank
1 Bob 5
2 Carla 4
3 Diego 3
4 Eric 1
5 Gary 2
我想生成一个矩阵:
1 2 3
Alice 1 2 0
Bob 2 0 5
Carla 3 1 4
Diego 4 0 3
Eric 0 3 1
Frank 0 4 0
Gary 0 5 2
我正在寻找一种有效的方法:我的数据更像200个数据帧和每个数据帧10000个排名元素(总共15000个独特元素),因此最终矩阵将大约为15000x200
答案 0 :(得分:2)
以下是使用reshape2
包的解决方案:
require(reshape2)
dcast(do.call(rbind, lapply(seq_along(ll), function(ix)
transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
name 1 2 3
1 Alice 1 2 0
2 Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5 Eric 0 3 1
6 Frank 0 4 0
7 Gary 0 5 2
其中ll
是您data.frame
s的列表。
或等效地:
dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))),
name ~ id, value.var = "rank", fill = 0)
data.table
解决方案:
require(data.table)
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:3)][is.na(rank), rank := 0L][, as.list(rank), by = name]
name V1 V2 V3
1: Alice 1 2 0
2: Bob 2 0 5
3: Carla 3 1 4
4: Diego 4 0 3
5: Eric 0 3 1
6: Frank 0 4 0
7: Gary 0 5 2
一些基准测试(现在我们已经得到了很多答案):
names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)
dd_create <- function() {
nrow <- sample(c(100:500), 1)
ncol <- 3
data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}
ll <- replicate(1e3, dd_create(), simplify = FALSE)
require(reshape2)
require(data.table)
Arun1_reshape2 <- function(ll) {
# same as @agstudy's
dcast(do.call(rbind, lapply(seq_along(ll), function(ix)
transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
}
Arun2_reshape2 <- function(ll) {
dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))),
name ~ id, value.var = "rank", fill = 0)
}
eddi_reshape2 <- function(ll) {
dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
}
Arun_data.table <- function(ll) {
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}
merge.all <- function(x, y) {
merge(x, y, all=TRUE, by="name")
}
Hong_Ooi <- function(ll) {
for(i in seq_along(ll))
names(ll[[i]])[2] <- paste0("rank", i)
out <- Reduce(merge.all, ll)
}
require(microbenchmark)
microbenchmark( arun1 <- Arun1_reshape2(ll),
arun2 <- Arun2_reshape2(ll),
eddi <- eddi_reshape2(ll),
hong <- Hong_Ooi(ll),
arun.dt <- Arun_data.table(ll), times=10)
Unit: seconds
expr min lq median uq max neval
arun1 <- Arun1_reshape2(ll) 9.157160 9.177143 9.366775 9.715767 28.043125 10
arun2 <- Arun2_reshape2(ll) 8.408356 8.437066 8.494233 9.018796 10.075029 10
eddi <- eddi_reshape2(ll) 8.056605 8.314110 8.402396 8.474129 9.124581 10
hong <- Hong_Ooi(ll) 82.457432 82.716930 82.908646 108.413217 321.164598 10
arun.dt <- Arun_data.table(ll) 2.006474 2.123331 2.212783 2.311619 2.738914 10
答案 1 :(得分:2)
这是一个更简单的reshape2
解决方案:
library(reshape2)
dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
# name 1 2 3
#1 Alice 1 2 0
#2 Bob 2 0 5
#3 Carla 3 1 4
#4 Diego 4 0 3
#5 Eric 0 3 1
#6 Frank 0 4 0
#7 Gary 0 5 2
Arun的基准测试非常有趣,似乎data.table
确实很好的是融化部分,而reshape2
真正做得很好的是dcast
,所以这里是最好的两个世界:
library(reshape2)
library(data.table)
pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')
使用Arun的基准数据:
names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)
dd_create <- function() {
nrow <- sample(c(100:500), 1)
ncol <- 3
data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}
ll <- replicate(1e3, dd_create(), simplify = FALSE)
Arun_data.table <- function(ll) {
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}
mix_of_both = function(ll) {
pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')
}
require(microbenchmark)
microbenchmark(Arun_data.table(ll), mix_of_both(ll), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# Arun_data.table(ll) 2568.333 2586.0079 2626.7704 2832.8076 2911.1314 10
# mix_of_both(ll) 615.166 739.9383 766.8994 788.5822 821.0478 10
答案 2 :(得分:1)
这里自OP以来的数据没有提供可重复的例子:
dput(ll)
list(structure(list(name = structure(1:4, .Label = c("Alice",
"Bob", "Carla", "Diego"), class = "factor"), rank = 1:4), .Names = c("name",
"rank"), class = "data.frame", row.names = c("1", "2", "3", "4"
)), structure(list(name = structure(1:5, .Label = c("Alice",
"Carla", "Eric", "Frank", "Gary"), class = "factor"), rank = c(2L,
1L, 3L, 4L, 5L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5")), structure(list(name = structure(1:5, .Label = c("Bob",
"Carla", "Diego", "Eric", "Gary"), class = "factor"), rank = c(5L,
4L, 3L, 1L, 2L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5")))
与@Arun one几乎相同的解决方案,但是在2个separtes步骤中:
## add race column
ll <- lapply(seq_along(ll),function(x){
ll[[x]]$race <- x
ll[[x]]
})
## create a long data.frame
dd <- do.call(rbind,ll)
## transform to the wide format
library(reshape2)
dcast(name~race,data=dd,fill=0,value.var='rank')
name 1 2 3
1 Alice 1 2 0
2 Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5 Eric 0 3 1
6 Frank 0 4 0
7 Gary 0 5 2
答案 3 :(得分:1)
似乎是另一个Reduce
用例。
merge.all <- function(x, y)
merge(x, y, all=TRUE, by="name")
# to avoid problems with merged name clashes
for(i in seq_along(ll))
names(ll[[i]])[2] <- paste0("rank", i)
out <- Reduce(merge.all, ll)
您必须稍微修改数据框,以避免merge
抱怨名称冲突; for
循环与此目的一样有效。
任何缺席的比赛都有NA。你可以用out[is.na(out)] <- 0
替换为0;你应该问问自己这是否合情合理。例如,如果您这样做,那么简单的汇总统计信息(如均值,差异等)将产生误导性结果。如果您想进行更复杂的建模,情况也是如此。相比之下,大多数R建模功能都足够智能,可以排除NA。