Question

前段时间我编写了一个函数来填充时间序列矩阵，根据所需的规范填充NA值，并且它在大约50000行，350列的几个矩阵上具有一些用途。矩阵可以包含数字或字符值。主要的问题是修复矩阵的速度很慢，我想我会评估一些专家如何更快地完成这项工作。

我想去rcpp或平行它可能有所帮助，但我认为这可能是我的设计而不是R本身效率低下。我通常在R中对所有内容进行处理，但由于缺失的值没有遵循任何模式，我发现除了基于每行的矩阵之外没有别的办法。

需要调用该函数，以便它可以转发缺失值，并且还可以调用以快速填充最后一个已知值的最新值。

以下是一个示例矩阵：

testMatrix <- structure(c(NA, NA, NA, 29.98, 66.89, NA, -12.78, -11.65, NA, 
 4.03, NA, NA, NA, 29.98, 66.89, NA, -12.78, -11.65, NA, NA, NA, 
 NA, NA, 29.98, 66.89, NA, -12.78, NA, NA, 4.76, NA, NA, NA, NA, 
 66.89, NA, -12.78, NA, NA, 4.76, NA, NA, NA, 29.98, 66.89, NA, 
 -12.78, NA, NA, 4.76, NA, NA, NA, 29.98, 66.89, NA, -12.78, NA, 
 NA, 4.39, NA, NA, NA, 29.98, 66.89, NA, -10.72, -11.65, NA, 4.39, 
 NA, NA, NA, 29.98, 50.65, NA, -10.72, -11.65, NA, 4.39, NA, NA, 
 4.72, NA, 50.65, NA, -10.72, -38.61, 45.3, NA), .Dim = c(10L, 
 9L), .Dimnames = list(c("ID_a", "ID_b", "ID_c", "ID_d", "ID_e", 
 "ID_f", "ID_g", "ID_h", "ID_i", "ID_j"), c("2010-09-30", "2010-10-31", 
 "2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28", "2011-03-31", 
 "2011-04-30", "2011-05-31")))

print(testMatrix)
     2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_b         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_c         NA         NA         NA         NA         NA         NA         NA         NA       4.72
ID_d      29.98      29.98      29.98         NA      29.98      29.98      29.98      29.98         NA
ID_e      66.89      66.89      66.89      66.89      66.89      66.89      66.89      50.65      50.65
ID_f         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_g     -12.78     -12.78     -12.78     -12.78     -12.78     -12.78     -10.72     -10.72     -10.72
ID_h     -11.65     -11.65         NA         NA         NA         NA     -11.65     -11.65     -38.61
ID_i         NA         NA         NA         NA         NA         NA         NA         NA      45.30
ID_j       4.03         NA       4.76       4.76       4.76       4.39       4.39       4.39         NA

这是我目前使用的功能：

# ----------------------------------------------------------------------------
# GetMatrixWithBlanksFilled
# ----------------------------------------------------------------------------
#
# Arguments:
# inputMatrix --- A matrix with gaps in the time series rows
# fillGapMax  --- The max number of columns to carry a number
#                 forward if there are no more values in the
#                 time series row.
#
# Returns:
# A matrix with gaps filled.

GetMatrixWithBlanksFilled <- function(inputMatrix, fillGapMax = 6, forwardLooking = TRUE) {

    if("DEBUG_ON" %in% ls(globalenv())){browser()}

    cntRow <- nrow(inputMatrix)
    cntCol <- ncol(inputMatrix)

    # 
    if (forwardLooking) {
        for (i in 1:cntRow) {
            # Store the location of the first non NA element in the row
            firstValueCol <- (1:cntCol)[!is.na(inputMatrix[i,])][1]
            if (!(is.na(firstValueCol))) {
                if (!(firstValueCol == cntCol)) {
                    nextValueCol <- firstValueCol
                    # If there is a a value number in the row and it's not at the end of the time
                    # series, start iterating through the row while there are more NA values and
                    # more data values and not at the end of the row continue.
                    while ((sum(as.numeric(is.na(inputMatrix[i,nextValueCol:cntCol]))))>0 && (sum(as.numeric(!is.na(inputMatrix[i,nextValueCol:cntCol]))))>0 && !(nextValueCol == cntCol)) {
                        # Find the next NA element
                        nextNaCol <- (nextValueCol:cntCol)[is.na(inputMatrix[i,nextValueCol:cntCol])][1]
                        # Find the next value element
                        nextValueCol <- (nextNaCol:cntCol)[!is.na(inputMatrix[i,nextNaCol:cntCol])][1]
                        # If there is another value element then fill up all NA elements in between with the last known value
                        if (!is.na(nextValueCol)) {
                            inputMatrix[i,nextNaCol:(nextValueCol-1)] <- inputMatrix[i,(nextNaCol-1)]
                        } else {
                            # If there is no other value element then fill up all NA elements up to the max number supplied
                            # with the last known value unless it's close to the end of the row then just fill up to the end.
                            inputMatrix[i,nextNaCol:min(nextNaCol+fillGapMax,cntCol)] <- inputMatrix[i,(nextNaCol-1)]
                            nextValueCol <- cntCol
                        }
                    }
                }
            }
        }
    } else {
        for (i in 1:cntRow) {
            if (is.na(inputMatrix[i,ncol(inputMatrix)])) {
                tempRow <- inputMatrix[i,max(1,length(inputMatrix[i,])-fillGapMax):length(inputMatrix[i,])]
                if (length(tempRow[!is.na(tempRow)])>0) {
                    lastNonNaLocation <- (length(tempRow):1)[!is.na(tempRow)][length(tempRow[!is.na(tempRow)])]
                    inputMatrix[i,(ncol(inputMatrix)-lastNonNaLocation+2):ncol(inputMatrix)] <- tempRow[!is.na(tempRow)][length(tempRow[!is.na(tempRow)])]
                }
            }
        }
    }

    return(inputMatrix)
}

然后用类似的东西来调用它：

> fixedMatrix1 <- GetMatrixWithBlanksFilled(testMatrix,fillGapMax=12,forwardLooking=TRUE)
> print(fixedMatrix1)
     2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_b         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_c         NA         NA         NA         NA         NA         NA         NA         NA       4.72
ID_d      29.98      29.98      29.98      29.98      29.98      29.98      29.98      29.98      29.98
ID_e      66.89      66.89      66.89      66.89      66.89      66.89      66.89      50.65      50.65
ID_f         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_g     -12.78     -12.78     -12.78     -12.78     -12.78     -12.78     -10.72     -10.72     -10.72
ID_h     -11.65     -11.65     -11.65     -11.65     -11.65     -11.65     -11.65     -11.65     -38.61
ID_i         NA         NA         NA         NA         NA         NA         NA         NA      45.30
ID_j       4.03       4.03       4.76       4.76       4.76       4.39       4.39       4.39       4.39

或

> fixedMatrix2 <- GetMatrixWithBlanksFilled(testMatrix,fillGapMax=1,forwardLooking=FALSE)
> print(fixedMatrix2)
     2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_b         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_c         NA         NA         NA         NA         NA         NA         NA         NA       4.72
ID_d      29.98      29.98      29.98         NA      29.98      29.98      29.98      29.98      29.98
ID_e      66.89      66.89      66.89      66.89      66.89      66.89      66.89      50.65      50.65
ID_f         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_g     -12.78     -12.78     -12.78     -12.78     -12.78     -12.78     -10.72     -10.72     -10.72
ID_h     -11.65     -11.65         NA         NA         NA         NA     -11.65     -11.65     -38.61
ID_i         NA         NA         NA         NA         NA         NA         NA         NA      45.30
ID_j       4.03         NA       4.76       4.76       4.76       4.39       4.39       4.39       4.39

这个例子运行得很快，但是有没有办法让它对大型矩阵更快？

> n <- 38
> m <- 5000
> bigM <- matrix(rep(testMatrix,n*m),m*nrow(testMatrix),n*ncol(testMatrix),FALSE)
> system.time(output <- GetMatrixWithBlanksFilled(bigM,fillGapMax=12,forwardLooking=TRUE))
   user  system elapsed 
  86.47    0.06   87.24

这个假人有很多NA行和完全填充的行，但正常行可能需要大约15-20分钟。

更新

关于Charles关于na.locf的评论没有完全反映上述逻辑：下面是最终函数如何排除输入检查等的简化版本：

FillGaps <- function( dataMatrix, fillGapMax ) {

    require("zoo")

    numRow <- nrow(dataMatrix) 
    numCol <- ncol(dataMatrix) 

    iteration <- (numCol-fillGapMax)

    if(length(iteration)>0) {
        for (i in iteration:1) {
            tempMatrix <- dataMatrix[,i:(i+fillGapMax),drop=FALSE]
            tempMatrix <- t(zoo::na.locf(t(tempMatrix), na.rm=FALSE, maxgap=fillGapMax))
            dataMatrix[,i:(i+fillGapMax)] <- tempMatrix
        }
    }

    return(dataMatrix)
}

Answer 1

我可能错了，但我认为这是在zoo包中实现的：使用na.locf函数。

使用给定的示例矩阵，首先我们应该转置它，并在调用na函数后，我们'重新转换'结果矩阵。例如：

> t(na.locf(t(testMatrix), na.rm=FALSE, maxgap=12))
     2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_b         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_c         NA         NA         NA         NA         NA         NA         NA         NA       4.72
ID_d      29.98      29.98      29.98      29.98      29.98      29.98      29.98      29.98      29.98
ID_e      66.89      66.89      66.89      66.89      66.89      66.89      66.89      50.65      50.65
ID_f         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_g     -12.78     -12.78     -12.78     -12.78     -12.78     -12.78     -10.72     -10.72     -10.72
ID_h     -11.65     -11.65     -11.65     -11.65     -11.65     -11.65     -11.65     -11.65     -38.61
ID_i         NA         NA         NA         NA         NA         NA         NA         NA      45.30
ID_j       4.03       4.03       4.76       4.76       4.76       4.39       4.39       4.39       4.39

小maxgap：

> t(na.locf(t(testMatrix), na.rm=FALSE, maxgap=0))
     2010-09-30 2010-10-31 2010-11-30 2010-12-31 2011-01-31 2011-02-28 2011-03-31 2011-04-30 2011-05-31
ID_a         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_b         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_c         NA         NA         NA         NA         NA         NA         NA         NA       4.72
ID_d      29.98      29.98      29.98         NA      29.98      29.98      29.98      29.98         NA
ID_e      66.89      66.89      66.89      66.89      66.89      66.89      66.89      50.65      50.65
ID_f         NA         NA         NA         NA         NA         NA         NA         NA         NA
ID_g     -12.78     -12.78     -12.78     -12.78     -12.78     -12.78     -10.72     -10.72     -10.72
ID_h     -11.65     -11.65         NA         NA         NA         NA     -11.65     -11.65     -38.61
ID_i         NA         NA         NA         NA         NA         NA         NA         NA      45.30
ID_j       4.03         NA       4.76       4.76       4.76       4.39       4.39       4.39         NA

可以看到使用na.locf获得的效果：

>  system.time(output <- GetMatrixWithBlanksFilled(bigM,fillGapMax=12,forwardLooking=TRUE))
   user  system elapsed 
 79.238   0.540  80.398 
> system.time(output <- t(na.locf(t(bigM), na.rm=FALSE, maxgap=12)))
   user  system elapsed 
 17.129   0.267  17.513

如何使这个R矩阵填充功能更快？

1 个答案: