前段时间我编写了一个函数来填充时间序列矩阵,根据所需的规范填充NA值,并且它在大约50000行,350列的几个矩阵上具有一些用途。矩阵可以包含数字或字符值。主要的问题是修复矩阵的速度很慢,我想我会评估一些专家如何更快地完成这项工作。
我想去rcpp或平行它可能有所帮助,但我认为这可能是我的设计而不是R本身效率低下。我通常在R中对所有内容进行处理,但由于缺失的值没有遵循任何模式,我发现除了基于每行的矩阵之外没有别的办法。
需要调用该函数,以便它可以转发缺失值,并且还可以调用以快速填充最后一个已知值的最新值。
以下是一个示例矩阵:
testMatrix <- structure(c(NA, NA, NA, 29.98, 66.89, NA, -12.78, -11.65, NA,
4.03, NA, NA, NA, 29.98, 66.89, NA, -12.78, -11.65, NA, NA, NA,
NA, NA, 29.98, 66.89, NA, -12.78, NA, NA, 4.76, NA, NA, NA, NA,
66.89, NA, -12.78, NA, NA, 4.76, NA, NA, NA, 29.98, 66.89, NA,
-12.78, NA, NA, 4.76, NA, NA, NA, 29.98, 66.89, NA, -12.78, NA,
NA, 4.39, NA, NA, NA, 29.98, 66.89, NA, -10.72, -11.65, NA, 4.39,
NA, NA, NA, 29.98, 50.65, NA, -10.72, -11.65, NA, 4.39, NA, NA,
4.72, NA, 50.65, NA, -10.72, -38.61, 45.3, NA), .Dim = c(10L,
9L), .Dimnames = list(c("ID_a", "ID_b", "ID_c", "ID_d", "ID_e",
"ID_f", "ID_g", "ID_h", "ID_i", "ID_j"), c("2010-09-30", "2010-10-31",
"2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28", "2011-03-31",
"2011-04-30", "2011-05-31")))
print(testMatrix)
2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a NA NA NA NA NA NA NA NA NA
ID_b NA NA NA NA NA NA NA NA NA
ID_c NA NA NA NA NA NA NA NA 4.72
ID_d 29.98 29.98 29.98 NA 29.98 29.98 29.98 29.98 NA
ID_e 66.89 66.89 66.89 66.89 66.89 66.89 66.89 50.65 50.65
ID_f NA NA NA NA NA NA NA NA NA
ID_g -12.78 -12.78 -12.78 -12.78 -12.78 -12.78 -10.72 -10.72 -10.72
ID_h -11.65 -11.65 NA NA NA NA -11.65 -11.65 -38.61
ID_i NA NA NA NA NA NA NA NA 45.30
ID_j 4.03 NA 4.76 4.76 4.76 4.39 4.39 4.39 NA
这是我目前使用的功能:
# ----------------------------------------------------------------------------
# GetMatrixWithBlanksFilled
# ----------------------------------------------------------------------------
#
# Arguments:
# inputMatrix --- A matrix with gaps in the time series rows
# fillGapMax --- The max number of columns to carry a number
# forward if there are no more values in the
# time series row.
#
# Returns:
# A matrix with gaps filled.
GetMatrixWithBlanksFilled <- function(inputMatrix, fillGapMax = 6, forwardLooking = TRUE) {
if("DEBUG_ON" %in% ls(globalenv())){browser()}
cntRow <- nrow(inputMatrix)
cntCol <- ncol(inputMatrix)
#
if (forwardLooking) {
for (i in 1:cntRow) {
# Store the location of the first non NA element in the row
firstValueCol <- (1:cntCol)[!is.na(inputMatrix[i,])][1]
if (!(is.na(firstValueCol))) {
if (!(firstValueCol == cntCol)) {
nextValueCol <- firstValueCol
# If there is a a value number in the row and it's not at the end of the time
# series, start iterating through the row while there are more NA values and
# more data values and not at the end of the row continue.
while ((sum(as.numeric(is.na(inputMatrix[i,nextValueCol:cntCol]))))>0 && (sum(as.numeric(!is.na(inputMatrix[i,nextValueCol:cntCol]))))>0 && !(nextValueCol == cntCol)) {
# Find the next NA element
nextNaCol <- (nextValueCol:cntCol)[is.na(inputMatrix[i,nextValueCol:cntCol])][1]
# Find the next value element
nextValueCol <- (nextNaCol:cntCol)[!is.na(inputMatrix[i,nextNaCol:cntCol])][1]
# If there is another value element then fill up all NA elements in between with the last known value
if (!is.na(nextValueCol)) {
inputMatrix[i,nextNaCol:(nextValueCol-1)] <- inputMatrix[i,(nextNaCol-1)]
} else {
# If there is no other value element then fill up all NA elements up to the max number supplied
# with the last known value unless it's close to the end of the row then just fill up to the end.
inputMatrix[i,nextNaCol:min(nextNaCol+fillGapMax,cntCol)] <- inputMatrix[i,(nextNaCol-1)]
nextValueCol <- cntCol
}
}
}
}
}
} else {
for (i in 1:cntRow) {
if (is.na(inputMatrix[i,ncol(inputMatrix)])) {
tempRow <- inputMatrix[i,max(1,length(inputMatrix[i,])-fillGapMax):length(inputMatrix[i,])]
if (length(tempRow[!is.na(tempRow)])>0) {
lastNonNaLocation <- (length(tempRow):1)[!is.na(tempRow)][length(tempRow[!is.na(tempRow)])]
inputMatrix[i,(ncol(inputMatrix)-lastNonNaLocation+2):ncol(inputMatrix)] <- tempRow[!is.na(tempRow)][length(tempRow[!is.na(tempRow)])]
}
}
}
}
return(inputMatrix)
}
然后用类似的东西来调用它:
> fixedMatrix1 <- GetMatrixWithBlanksFilled(testMatrix,fillGapMax=12,forwardLooking=TRUE)
> print(fixedMatrix1)
2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a NA NA NA NA NA NA NA NA NA
ID_b NA NA NA NA NA NA NA NA NA
ID_c NA NA NA NA NA NA NA NA 4.72
ID_d 29.98 29.98 29.98 29.98 29.98 29.98 29.98 29.98 29.98
ID_e 66.89 66.89 66.89 66.89 66.89 66.89 66.89 50.65 50.65
ID_f NA NA NA NA NA NA NA NA NA
ID_g -12.78 -12.78 -12.78 -12.78 -12.78 -12.78 -10.72 -10.72 -10.72
ID_h -11.65 -11.65 -11.65 -11.65 -11.65 -11.65 -11.65 -11.65 -38.61
ID_i NA NA NA NA NA NA NA NA 45.30
ID_j 4.03 4.03 4.76 4.76 4.76 4.39 4.39 4.39 4.39
或
> fixedMatrix2 <- GetMatrixWithBlanksFilled(testMatrix,fillGapMax=1,forwardLooking=FALSE)
> print(fixedMatrix2)
2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a NA NA NA NA NA NA NA NA NA
ID_b NA NA NA NA NA NA NA NA NA
ID_c NA NA NA NA NA NA NA NA 4.72
ID_d 29.98 29.98 29.98 NA 29.98 29.98 29.98 29.98 29.98
ID_e 66.89 66.89 66.89 66.89 66.89 66.89 66.89 50.65 50.65
ID_f NA NA NA NA NA NA NA NA NA
ID_g -12.78 -12.78 -12.78 -12.78 -12.78 -12.78 -10.72 -10.72 -10.72
ID_h -11.65 -11.65 NA NA NA NA -11.65 -11.65 -38.61
ID_i NA NA NA NA NA NA NA NA 45.30
ID_j 4.03 NA 4.76 4.76 4.76 4.39 4.39 4.39 4.39
这个例子运行得很快,但是有没有办法让它对大型矩阵更快?
> n <- 38
> m <- 5000
> bigM <- matrix(rep(testMatrix,n*m),m*nrow(testMatrix),n*ncol(testMatrix),FALSE)
> system.time(output <- GetMatrixWithBlanksFilled(bigM,fillGapMax=12,forwardLooking=TRUE))
user system elapsed
86.47 0.06 87.24
这个假人有很多NA行和完全填充的行,但正常行可能需要大约15-20分钟。
更新
关于Charles关于na.locf的评论没有完全反映上述逻辑:下面是最终函数如何排除输入检查等的简化版本:
FillGaps <- function( dataMatrix, fillGapMax ) {
require("zoo")
numRow <- nrow(dataMatrix)
numCol <- ncol(dataMatrix)
iteration <- (numCol-fillGapMax)
if(length(iteration)>0) {
for (i in iteration:1) {
tempMatrix <- dataMatrix[,i:(i+fillGapMax),drop=FALSE]
tempMatrix <- t(zoo::na.locf(t(tempMatrix), na.rm=FALSE, maxgap=fillGapMax))
dataMatrix[,i:(i+fillGapMax)] <- tempMatrix
}
}
return(dataMatrix)
}
答案 0 :(得分:6)
我可能错了,但我认为这是在zoo包中实现的:使用na.locf
函数。
使用给定的示例矩阵,首先我们应该转置它,并在调用na
函数后,我们'重新转换'结果矩阵。例如:
> t(na.locf(t(testMatrix), na.rm=FALSE, maxgap=12))
2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a NA NA NA NA NA NA NA NA NA
ID_b NA NA NA NA NA NA NA NA NA
ID_c NA NA NA NA NA NA NA NA 4.72
ID_d 29.98 29.98 29.98 29.98 29.98 29.98 29.98 29.98 29.98
ID_e 66.89 66.89 66.89 66.89 66.89 66.89 66.89 50.65 50.65
ID_f NA NA NA NA NA NA NA NA NA
ID_g -12.78 -12.78 -12.78 -12.78 -12.78 -12.78 -10.72 -10.72 -10.72
ID_h -11.65 -11.65 -11.65 -11.65 -11.65 -11.65 -11.65 -11.65 -38.61
ID_i NA NA NA NA NA NA NA NA 45.30
ID_j 4.03 4.03 4.76 4.76 4.76 4.39 4.39 4.39 4.39
小maxgap
:
> t(na.locf(t(testMatrix), na.rm=FALSE, maxgap=0))
2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a NA NA NA NA NA NA NA NA NA
ID_b NA NA NA NA NA NA NA NA NA
ID_c NA NA NA NA NA NA NA NA 4.72
ID_d 29.98 29.98 29.98 NA 29.98 29.98 29.98 29.98 NA
ID_e 66.89 66.89 66.89 66.89 66.89 66.89 66.89 50.65 50.65
ID_f NA NA NA NA NA NA NA NA NA
ID_g -12.78 -12.78 -12.78 -12.78 -12.78 -12.78 -10.72 -10.72 -10.72
ID_h -11.65 -11.65 NA NA NA NA -11.65 -11.65 -38.61
ID_i NA NA NA NA NA NA NA NA 45.30
ID_j 4.03 NA 4.76 4.76 4.76 4.39 4.39 4.39 NA
可以看到使用na.locf
获得的效果:
> system.time(output <- GetMatrixWithBlanksFilled(bigM,fillGapMax=12,forwardLooking=TRUE))
user system elapsed
79.238 0.540 80.398
> system.time(output <- t(na.locf(t(bigM), na.rm=FALSE, maxgap=12)))
user system elapsed
17.129 0.267 17.513