在R中,我试图在众多数据文件中创建元素的计数矩阵:
# Row labels of the count matrix; only the second row ("A") is written below.
rnames <- c("N", "A")
# Start with two rows and no columns.
mymatrix <- matrix(nrow = 2, ncol = 0, dimnames = list(rnames))
# loop through hundreds of large files (MB)
# make the vector "names" contain all elements within each file
for (name in names) {
  if (!(name %in% colnames(mymatrix))) {
    # Unseen name: transform() is called on the matrix first, then the
    # assignment sets the count for this name to 1 in the second row.
    mymatrix <- transform(mymatrix, name)
    mymatrix[2, name] <- 1
  } else {
    # Name already present as a column: bump its count in the second row.
    mymatrix[2, name] <- mymatrix[2, name] + 1
  }
}
我用Rprof做了性能分析,发现%in%运算符内部调用的match()函数可能是性能瓶颈(导致执行时间变长)。
有没有更高效的方法来检查向量中的每个元素是否已经存在于矩阵中,并在不存在时以该元素作为列名向矩阵添加新列?
如果你需要可重现的代码,示例如下。但请记住,在我的原始代码中,names向量是从包含数千个变量的大文件中读取的,这些变量要与mymatrix中不断增加的列逐一匹配,因此运行时间会越来越长:
# Reproducible example: count how often each element occurs across two "files".
# Only the second row ("A") is written; it holds the running count per element.
rnames <- c("N", "A")
mymatrix <- matrix(nrow = 2, ncol = 0, dimnames = list(rnames))
# Keep the contents of both files in a list. Assigning them to `names` one
# after the other would overwrite the first vector before the loop runs, so
# only the second file would be counted.
file_contents <- list(
  # suppose this is what the first file contains
  c("x", "y", "z", "x", "x", "y", "a"),
  # suppose this is what the second file contains
  c("x", "y", "z", "x", "x", "x", "x", "k")
)
for (names in file_contents) {
  for (name in names) {
    if (name %in% colnames(mymatrix)) {
      # Known element: increment its count.
      mymatrix[2, name] <- mymatrix[2, name] + 1
    } else {
      # New element: transform() is kept from the original code; the
      # assignment below sets the count for this name to 1.
      mymatrix <- transform(mymatrix, name)
      mymatrix[2, name] <- 1
    }
  }
}
the expected output
> mymatrix
x y z a k
N NA NA NA NA NA
A 8 3 2 1 1
答案 0 :(得分:1)
我不知道你是如何确定 match 是瓶颈的。它可能是,但你提供的示例并不能说明这一点。
# Reproduce the question's counting loop on a larger input and profile it.
rnames <- c("N","A")
mymatrix <- matrix(nrow=2, ncol=0, dimnames=list(rnames))
# Fixed seed so the sampled input is reproducible.
set.seed(21)
# One million draws, with replacement, from the 26 lowercase letters.
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for(name in names) {
if(name %in% colnames(mymatrix)) {
# Known name: increment its count in the second row.
mymatrix[2,name] <- mymatrix[2,name] + 1
} else {
# New name: call transform() as in the question, then set the count to 1.
mymatrix <- transform(mymatrix,name)
mymatrix[2,name] <- 1
}
}
# Stop profiling.
Rprof(NULL)
下面的结果显示,瓶颈在于data.frame的方法,而这些方法是因为你使用了 transform 才被调用的。transform.default 会把你的矩阵转换为data.frame,然后调用 transform.data.frame,其中包含对 match 的调用。
R> lapply(summaryRprof(), head)
$by.self
self.time self.pct total.time total.pct
"[<-.data.frame" 12.02 26.15 25.90 56.35
"[.data.frame" 7.22 15.71 13.32 28.98
"match" 7.20 15.67 11.40 24.80
"%in%" 2.38 5.18 12.34 26.85
"anyDuplicated" 2.22 4.83 3.08 6.70
"names" 2.16 4.70 2.16 4.70
$by.total
total.time total.pct self.time self.pct
"[<-" 27.06 58.88 1.16 2.52
"[<-.data.frame" 25.90 56.35 12.02 26.15
"[" 14.32 31.16 1.00 2.18
"[.data.frame" 13.32 28.98 7.22 15.71
"%in%" 12.34 26.85 2.38 5.18
"match" 11.40 24.80 7.20 15.67
$sample.interval
[1] 0.02
$sampling.time
[1] 45.96
避免调用 transform,你的代码会明显加快。此时 mymatrix2 确实是一个矩阵,而 mymatrix 是一个data.frame。
# Same counting loop without transform(): new columns are appended to
# mymatrix2 with cbind() instead.
rnames <- c("N", "A")
mymatrix2 <- matrix(nrow = 2, ncol = 0, dimnames = list(rnames))
# Fixed seed so the sampled input is reproducible.
set.seed(21)
# One million draws, with replacement, from the 26 lowercase letters.
names <- sample(letters, 1e6, TRUE)
# Start profiling.
Rprof()
for (name in names) {
  # Look the name up in mymatrix2 itself (the object being built), not in
  # another object, so the branch taken reflects the columns added so far.
  if (name %in% colnames(mymatrix2)) {
    mymatrix2[2, name] <- mymatrix2[2, name] + 1
  } else {
    # First occurrence: append a column with N = NA and A = 1.
    mymatrix2 <- cbind(
      mymatrix2,
      matrix(c(NA, 1), 2, 1, dimnames = list(rnames, name))
    )
  }
}
# Stop profiling.
Rprof(NULL)
lapply(summaryRprof(), head)
$by.self
self.time self.pct total.time total.pct
"match" 1.28 41.83 2.70 88.24
"colnames" 0.78 25.49 1.42 46.41
"is.data.frame" 0.58 18.95 0.58 18.95
"%in%" 0.34 11.11 3.04 99.35
"dimnames" 0.06 1.96 0.06 1.96
"+" 0.02 0.65 0.02 0.65
$by.total
total.time total.pct self.time self.pct
"%in%" 3.04 99.35 0.34 11.11
"match" 2.70 88.24 1.28 41.83
"colnames" 1.42 46.41 0.78 25.49
"is.data.frame" 0.58 18.95 0.58 18.95
"dimnames" 0.06 1.96 0.06 1.96
"+" 0.02 0.65 0.02 0.65
identical(mymatrix2, as.matrix(mymatrix))
[1] TRUE