Question

数据和要求

第一个表格（myMatrix1）来自旧的地质调查，该调查使用不同的地区边界（begin和finish）列进行较新的调查。我想做的是匹配开始和结束边界，然后创建两个表，一个用于沉降的新数据，另一个用于钻孔宽度的新数据，表示为布尔值。

myMatrix1 <- read.table("/path/to/file")
myMatrix2 <- read.table("/path/to/file")

> head(myMatrix1)  # this is the old data

    sampleIDs begin finish   
1    19990224 4     5 
2    20000224 5     6 
3    20010203 6     8 
4    20019024 29    30 
5    20020201 51    52 

> head(myMatrix2)   # this is the new data

     begin finish  sedimentation    boreWidth
1    0     10       1.002455        0.014354
2    11    367      2.094351        0.056431
3    368   920      0.450275        0.154105
4    921   1414     2.250820        1.004353
5    1415  5278     0.114109        NA`

期望的输出：

> head(myMatrix6)

    sampleIDs begin finish  sedimentation #myMatrix4
1    19990224 4     5       1.002455
2    20000224 5     6       1.002455
3    20010203 6     8       2.094351
4    20019024 29    30      2.094351
5    20020201 51    52      2.094351

> head(myMatrix7)

    sampleIDs begin finish  boreWidthThresh #myMatrix5
1    19990224 4     5       FALSE
2    20000224 5     6       FALSE
3    20010203 6     8       FALSE
4    20019024 29    30      FALSE
5    20020201 51    52      FALSE`

CODE

以下代码花了我几个小时来运行我的数据集（大约500万个数据点）。有没有办法更改代码，使其运行得更快？

# create empty matrix for sedimentation
myMatrix6 <- data.frame(NA,NA,NA,NA)[0,]
names(myMatrix6) <- letters[1:4]

# create empty matrix for bore
myMatrix7 <- data.frame(NA,NA,NA,NA)[0,]
names(myMatrix7) <- letters[1:4]

for (i in 1:nrow(myMatrix2))
{       
    # create matrix that has the value of myMatrix1$begin being 
    # situated between the values of myMatrix2begin[i] and myMatrix2finish[i]
    myMatrix3 <- myMatrix1[which((myMatrix1$begin > myMatrix2$begin[i]) & (myMatrix1$begin <      myMatrix2$finish[i])),]

    myMatrix4 <- rep(myMatrix2$sedimentation, nrow(myMatrix3))

    if (is.na(myMatrix2$boreWidth[i])) {
        myMatrix5 <- rep(NA, nrow(myMatrix3))
    }
    else if (myMatrix2$boreWidth[i] == 0) {
    myMatrix5 <- rep(TRUE, nrow(myMatrix3))
    }
    else if (myMatrix2$boreWidth[i] > 0) {
    myMatrix5 <- rep(FALSE, nrow(myMatrix3))
    }

    myMatrix6 <- rbind(myMatrix6, cbind(myMatrix3, myMatrix4))
    myMatrix7 <- rbind(myMatrix7, cbind(myMatrix3, myMatrix5))
}

编辑：

> dput(head(myMatrix2)

structure(list(V1 = structure(c(6L, 1L, 2L, 4L, 5L, 3L), .Label = c("0", 
"11", "1415", "368", "921", "begin"), class = "factor"), V2 = structure(c(6L, 
1L, 3L, 5L, 2L, 4L), .Label = c("10", "1414", "367", "5278", 
"920", "finish"), class = "factor"), V3 = structure(c(6L, 3L, 
4L, 2L, 5L, 1L), .Label = c("0.114109", "0.450275", "1.002455", 
"2.094351", "2.250820", "sedimentation"), class = "factor"), 
    V4 = structure(c(5L, 1L, 2L, 3L, 4L, 6L), .Label = c("0.014354", 
    "0.056431", "0.154105", "1.004353", "boreWidth", "NA"), class = "factor")), .Names = c("V1", 
"V2", "V3", "V4"), row.names = c(NA, 6L), class = "data.frame")

> dput(head(myMatrix1)

structure(list(V1 = structure(c(6L, 1L, 2L, 3L, 4L, 5L), .Label = c("19990224", 
"20000224", "20010203", "20019024", "20020201", "sampleIDs"), class = "factor"), 
    V2 = structure(c(6L, 2L, 3L, 5L, 1L, 4L), .Label = c("29", 
    "4", "5", "51", "6", "begin"), class = "factor"), V3 = structure(c(6L, 
    2L, 4L, 5L, 1L, 3L), .Label = c("30", "5", "52", "6", "8", 
    "finish"), class = "factor")), .Names = c("V1", "V2", "V3"
), row.names = c(NA, 6L), class = "data.frame")

Answer 1

首先看看有关加速代码的一般建议：https://stackoverflow.com/a/8474941/636656

我跳出来的第一件事就是我只创建了一个结果矩阵。这样你就不会复制sampleIDs begin finish列，并且可以避免运行匹配算法两次带来的任何开销。

这样做，你可以避免多次选择（尽管只要存储你的选择向量而不是重新计算，它在速度方面是微不足道的。）

以下是使用apply的解决方案：

myMatrix1 <- data.frame(sampleIDs=c(19990224,20000224),begin=c(4,5),finish=c(5,6))
myMatrix2 <- data.frame(begin=c(0,11),finish=c(10,367),sed=c(1.002,2.01),boreWidth=c(.014,.056))

glommer <- function(x,myMatrix2) {
  x[4:5] <- as.numeric(myMatrix2[ myMatrix2$begin <= x["begin"] & myMatrix2$finish >= x["finish"], c("sed","boreWidth") ])
  names(x)[4:5] <- c("sed","boreWidth")
  return( x )
}

> t(apply( myMatrix1, 1, glommer, myMatrix2=myMatrix2))
     sampleIDs begin finish   sed boreWidth
[1,]  19990224     4      5 1.002     0.014
[2,]  20000224     5      6 1.002     0.014

我使用apply并将所有内容存储为数字。其他方法是返回data.frame并拥有sampleID并开始，完成整数。这可能会避免浮点错误的一些问题。

此解决方案假设没有边界情况（例如myMatrix1的开始，结束时间完全包含在另一个的开始，结束时间内）。如果您的数据更复杂，只需更改glommer（）函数即可。你想如何处理这个问题是一个实质性的问题。

使Dataframe循环代码运行得更快

1 个答案: