我有两个数据框x
和weights
,其中的列已配对。以下是示例数据框:
x = read.table(text = "
yr1 yr2 yr3 yr4
10 15 6 8
10 20 30 NA
NA 5 2 3
100 100 NA NA",
sep = "", header = TRUE)
weights = read.table(text = "
yr1 yr2 yr3 yr4
2 4 1 3
2 2 4 2
3 2 2 3
4 2 2 4",
sep = "", header = TRUE)
列yr1
和yr2
是一对,列yr3
和yr4
是另一对。根据我的实际数据,列数最多为yr100
,并且有50对列。
如果yr1
中缺少yr2
或x
,我想填写缺失的观察结果,例如:
(5 / 2) * 3
同样适用于yr3
或yr4
:
(30 / 4) * 2
其中5(或30)是x
中列中的元素,对于给定的元素对,它不会丢失。第一个示例的值2和3(以及第二个示例中的值4和2)是weights
数据帧中给定元素对的x
数据帧中的对应元素。如果x
中缺少一对中的两个元素,我想让它们丢失。
以下是使用嵌套R
执行上述操作的for loops
代码。但是,我的实际数据集中有2000或3000行,嵌套的for loops
现在已经运行了> 10个小时。
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x)) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)]
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = NA
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = NA
}
}
我已经意识到第三和第四个if
语句可能不是必需的。如果我只删除那两个if
语句,那么运行此代码的时间可能会大大减少。
但是,我还想出了以下使用reshape
而不是嵌套for loops
的替代解决方案:
n.years <- 4
x2 <- reshape(x , direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
wt2 <- reshape(weights, direction="long", varying = list(seq(1,(n.years-1),2), seq(2,n.years,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x2$yr1 <- ifelse(is.na(x2$yr1), (x2$yr2 / wt2$yr2) * wt2$yr1, x2$yr1)
x2$yr2 <- ifelse(is.na(x2$yr2), (x2$yr1 / wt2$yr1) * wt2$yr2, x2$yr2)
x3 <- reshape(x2, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("yr1", "yr2"), times = c("t1", "t2"))
x3
在我关闭当前的R会话并尝试上述方法之一之前,请建议可能更有效的替代方案。我稍微使用了microbenchmark
,但还没有尝试过这样做,部分原因是为每个可能的解决方案编写一个函数对我来说有点吓人。我也尝试使用apply
系列函数提出解决方案,但无法提出解决方案。
我的reshape
解决方案来自这个问题:
Reshaping a data frame with more than one measure variable
除了计算时间,我还担心可能的内存耗尽。
我努力坚持使用基础R,但会考虑使用其他选项来获得所需的输出。感谢您的任何建议。
答案 0 :(得分:1)
这对你有用吗?
请注意,我没有使用您的替换函数,因为我发现它有点令人困惑,因此您必须修复如何使用公式替换yr1和yr2变量。此外,如果您需要将结果附加到原始数据框,则可能需要reshape
结果。
newx <-
reshape(x, direction="long",varying=list(1:50*2-1,1:50*2), v.names=c("v1","v2"))
newwt <-
reshape(weights, direction="long",varying=list(1:50*2-1,1:50*2), v.names=c("w1","w2"))
condwtmean <- function(x,y,wtx,wty){
if(xor(is.na(x),is.na(y))){
if(is.na(x))
x <- y # replacement function
if(is.na(y))
y <- x # replacement function
return(weighted.mean(c(x,y),c(wtx,wty)))
}
else if(!is.na(x) & !is.na(y))
return(weighted.mean(c(x,y),c(wtx,wty)))
else
return(NA)
}
newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)
答案 1 :(得分:0)
microbenchmark
进行比较。我还没有尝试过托马斯对实际数据的回答。我原来的嵌套for循环方法在22小时后仍在运行。
Unit: milliseconds
expr min lq median uq max neval
fn.1(x, weights) 98.69133 99.47574 100.5313 101.7315 108.8757 20
fn.2(x, weights) 755.51583 758.12175 762.3775 776.0558 801.9615 20
fn.3(x, weights) 564.21423 567.98822 568.5322 571.0975 575.1809 20
fn.4(x, weights) 367.05862 370.52657 371.7439 373.7367 395.0423 20
#########################################################################################
# create data
set.seed(1234)
n.rows <- 40
n.cols <- 40
n.sample <- n.rows * n.cols
x <- sample(20, n.sample, replace=TRUE)
x.NA <- sample(n.rows*n.cols, 10*(n.sample / n.rows), replace=FALSE)
x[x.NA] <- NA
x <- as.data.frame(matrix(x, nrow = n.rows))
weights <- sample(4, n.sample, replace=TRUE)
weights <- as.data.frame(matrix(weights, nrow = n.rows))
weights
#########################################################################################
# Thomas's function
fn.1 <- function(x, weights){
newx <- reshape(x, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("v1", "v2"))
newwt <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names=c("w1", "w2"))
condwtmean <- function(x,y,wtx,wty){
if(xor(is.na(x),is.na(y))){
if(is.na(x))
x <- (y / wty) * wtx # replacement function
if(is.na(y))
y <- (x / wtx) * wty # replacement function
return(weighted.mean(c(x,y),c(wtx,wty)))
}
else if(!is.na(x) & !is.na(y))
return(weighted.mean(c(x,y),c(wtx,wty)))
else
return(NA)
}
newx$wtmean <- mapply(condwtmean, newx$v1, newx$v2, newwt$w1, newwt$w2)
newx2 <- reshape(newx[,c(1,4:5)], v.names = "wtmean", timevar = "time", direction = "wide")
newx2 <- newx2[,2:(n.cols/2+1)]
names(newx2) <- paste('X', 1:(n.cols/2), sep = "")
return(newx2)
}
fn.1.output <- fn.1(x, weights)
#########################################################################################
# nested for-loops with 4 if statements
fn.2 <- function(x, weights){
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x)) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)]
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = NA
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = NA
}
}
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x) ) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = NA
}
}
return(weighted.x)
}
fn.2.output <- fn.2(x, weights)
fn.2.output <- as.data.frame(fn.2.output)
names(fn.2.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
# nested for-loops with 2 if statements
fn.3 <- function(x, weights){
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x)) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 0)] = (x[j,(1 + ((i-1)*2 + 1))] / weights[j,(1 + ((i-1)*2 + 1))]) * weights[j,(1 + (i-1)*2 + 0)]
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) x[j,(1 + (i-1)*2 + 1)] = (x[j,(1 + ((i-1)*2 + 0))] / weights[j,(1 + ((i-1)*2 + 0))]) * weights[j,(1 + (i-1)*2 + 1)]
}
}
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x) ) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = NA
}
}
return(weighted.x)
}
fn.3.output <- fn.3(x, weights)
fn.3.output <- as.data.frame(fn.3.output)
names(fn.3.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
# my reshape solution
fn.4 <- function(x, weights){
new.x <- reshape(x , direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("v1", "v2"))
wt <- reshape(weights, direction="long", varying = list(seq(1,(n.cols-1),2), seq(2,n.cols,2)), v.names = c("w1", "w2"))
new.x$v1 <- ifelse(is.na(new.x$v1), (new.x$v2 / wt$w2) * wt$w1, new.x$v1)
new.x$v2 <- ifelse(is.na(new.x$v2), (new.x$v1 / wt$w1) * wt$w2, new.x$v2)
x2 <- reshape(new.x, direction="wide", varying = list(seq(1,3,2), seq(2,4,2)), v.names = c("v1", "v2"))
x <- x2[,2:(n.cols+1)]
x.weights = x * weights
numerator <- sapply(seq(1,ncol(x.weights),2), function(i) {
apply(x.weights[,c(i, i+1)], 1, sum, na.rm=T)
})
denominator <- sapply(seq(1,ncol(weights),2), function(i) {
apply(weights[,c(i, i+1)], 1, sum, na.rm=T)
})
weighted.x <- numerator/denominator
for(i in 1: (ncol(x)/2)) {
for(j in 1: nrow(x) ) {
if( is.na(x[j,(1 + (i-1)*2)]) & !is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if(!is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = sum(c(x[j,(1 + ((i-1)*2))], x[j,(1 + ((i-1)*2 + 1))]), na.rm = TRUE)
if( is.na(x[j,(1 + (i-1)*2)]) & is.na(x[j,(1 + (i-1)*2 + 1)])) weighted.x[j,i] = NA
}
}
return(weighted.x)
}
fn.4.output <- fn.4(x, weights)
fn.4.output <- as.data.frame(fn.4.output)
names(fn.4.output) <- paste('X', 1:(n.cols/2), sep = "")
#########################################################################################
rownames(fn.1.output) <- NULL
rownames(fn.2.output) <- NULL
rownames(fn.3.output) <- NULL
rownames(fn.4.output) <- NULL
all.equal(fn.1.output, fn.2.output)
all.equal(fn.1.output, fn.3.output)
all.equal(fn.1.output, fn.4.output)
all.equal(fn.2.output, fn.3.output)
all.equal(fn.2.output, fn.4.output)
all.equal(fn.3.output, fn.4.output)
library(microbenchmark)
microbenchmark(fn.1(x, weights), fn.2(x, weights), fn.3(x, weights), fn.4(x, weights), times=20)
#########################################################################################