Question

在R中，我遇到的问题类似于此处提出的问题：Replacing NAs in R with nearest value。不同之处在于，我想要更改的值不是NA，而是任何小于0的值，并且是否更改这些值还取决于另一列中的值（因此需要添加条件语句）。我不知道如何让该问题中提出的解决方案适用于我的问题。由于我的数据量很大，所以速度也很重要。

示例数据

# Example data, written as (value, run length) pairs instead of element by
# element: a trip flag and a location number for 33 consecutive rows.
pred_trip <- rep(c(0, 1, 0, 1, 0, 1), times = c(5, 4, 3, 3, 11, 7))
locNumb <- rep(c(-1, 2, 3, 0, 4, -1, 0, 5), times = c(5, 4, 2, 3, 4, 5, 6, 4))
df <- data.frame(pred_trip = pred_trip, locNumb = locNumb)

基本上，如果locNumb列中的值 <= 0，并且pred_trip列中对应的值为0，则locNumb列中的该值将被重新赋值为最接近的大于0的值。

期望的输出：

   pred_trip locNumb
1          0       2
2          0       2
3          0       2
4          0       2
5          0       2
6          1       2
7          1       2
8          1       2
9          1       2
10         0       3
11         0       3
12         0       3
13         1       0
14         1       0
15         1       4
16         0       4
17         0       4
18         0       4
19         0       4
20         0       4
21         0       4
22         0       4
23         0       4
24         0       4
25         0       4
26         0       4
27         1       0
28         1       0
29         1       0
30         1       5
31         1       5
32         1       5
33         1       5

我在调整类似解决方案的代码时遇到了麻烦，因为它依赖于is.na，并且不包含我需要的其他条件。用伪代码表示大致如下（我不知道应该在哪里加入另一个条件 pred_trip == 0）：

#' Replace non-positive location numbers with the nearest positive one
#'
#' For every row where `locNumb <= 0` and `pred_trip == 0`, `locNumb` is
#' overwritten with the `locNumb` of the closest row (measured in row
#' position) whose value is greater than 0. When the row before and the row
#' after are equally close, the earlier one wins. Rows with `pred_trip != 0`
#' are never modified.
#'
#' @param df A data.frame with numeric columns `pred_trip` and `locNumb`.
#' @return `df` with the qualifying `locNumb` entries replaced. `df` is
#'   returned unchanged when no row qualifies or no positive value exists.
f1 <- function(df) {
  loc <- df$locNumb

  # Rows to fix: non-positive location AND no predicted trip.
  target_pos <- which(loc <= 0 & df$pred_trip == 0)
  # Rows that may donate a value: strictly positive locations.
  donor_pos <- which(loc > 0)
  if (length(target_pos) == 0L || length(donor_pos) == 0L) {
    return(df)
  }

  # For each target, findInterval() counts the donors at or before it:
  # 0 means the target precedes every donor, n_donor means it follows all.
  n_donor <- length(donor_pos)
  slot <- findInterval(target_pos, donor_pos)

  # Clamp so that targets outside the donor range see the same (only
  # available) donor on both sides.
  left_pos <- donor_pos[pmax(slot, 1L)]
  right_pos <- donor_pos[pmin(slot + 1L, n_donor)]
  left_dist <- abs(target_pos - left_pos)
  right_dist <- abs(right_pos - target_pos)

  # NOTE(review): the expected table in the question shows rows 25-26 as 4,
  # but by row position row 30 (value 5) is closer than row 18 (value 4).
  # Confirm whether positional nearest or a forward fill is wanted.
  source_pos <- ifelse(left_dist <= right_dist, left_pos, right_pos)
  df$locNumb[target_pos] <- loc[source_pos]
  df
}

Answer 1

这是一种方法。

rle会为您提供行程编码（run-length encoding），您可以先用NA替换其中的负值，然后使用zoo包中的na.locf函数向前（以及向后）填充最近的非负值。最后，inverse.rle函数可以还原出您想要的向量，我们可以把它作为newlocNumb列添加到原始data.frame df中。

之后可以根据任何其他条件（例如pred_trip == 0），用newlocNumb列中的值替换locNumb列中的部分原始值。

# zoo provides na.locf(); library() stops with an error if it is missing,
# instead of continuing and failing later at the first na.locf() call.
library(zoo)
pred_trip <- c(0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1)
locNumb <- c(-1, -1, -1, -1, -1, 2, 2, 2, 2, 3, 3, 0, 0, 0, 4, 4, 4, 4, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5)
df <- data.frame(pred_trip, locNumb)

# Collapse locNumb into runs of equal values so each run is filled as a unit.
RLE <- rle(df$locNumb)

RLE
## Run Length Encoding
##   lengths: int [1:8] 5 4 2 3 4 5 6 4
##   values : num [1:8] -1 2 3 0 4 -1 0 5


# Mark negative runs as missing so that na.locf() treats them as gaps.
RLE$values[RLE$values < 0] <- NA

# A forward fill followed by a backward fill closes every gap in one pass,
# provided at least one run is not missing. If every run is missing there is
# nothing to fill from, so the values are left as NA rather than looping.
if (!all(is.na(RLE$values))) {
  RLE$values <- na.locf(
    na.locf(RLE$values, na.rm = FALSE),
    fromLast = TRUE, na.rm = FALSE
  )
}

# Expand the runs back to one value per row.
df$newlocNumb <- inverse.rle(RLE)

df
##    pred_trip locNumb newlocNumb
## 1          0      -1          2
## 2          0      -1          2
## 3          0      -1          2
## 4          0      -1          2
## 5          0      -1          2
## 6          1       2          2
## 7          1       2          2
## 8          1       2          2
## 9          1       2          2
## 10         0       3          3
## 11         0       3          3
## 12         0       0          0
## 13         1       0          0
## 14         1       0          0
## 15         1       4          4
## 16         0       4          4
## 17         0       4          4
## 18         0       4          4
## 19         0      -1          4
## 20         0      -1          4
## 21         0      -1          4
## 22         0      -1          4
## 23         0      -1          4
## 24         0       0          0
## 25         0       0          0
## 26         0       0          0
## 27         1       0          0
## 28         1       0          0
## 29         1       0          0
## 30         1       5          5
## 31         1       5          5
## 32         1       5          5
## 33         1       5          5

Answer 2

顺便说一下，data.table库在内存使用方面非常高效，可以在这里使用：

library(data.table)

# Work on a data.table built from the data.frame.
dt <- as.data.table(df)

# Tag every row with its position; this becomes the join key.
dt[, grpno := seq_len(.N)]

# Candidate donor rows: those whose locNumb is strictly positive.
positive_rows <- dt[locNumb > 0]
positivelocNumb <- unique(positive_rows)

# Key both tables on grpno so the rolling join matches on row position.
setkey(positivelocNumb, grpno)
setkey(dt, grpno)

# Rolling join: each row of dt is matched to the donor row with the
# closest grpno.
dt2 <- positivelocNumb[dt, roll = "nearest"]

输出如下，其中pred_trip.1和locNumb.1是原始值，pred_trip和locNumb是最接近的正值所在行的值。您可以改用unique(dt[locNumb > 0, list(locNumb, grpno)])来创建positivelocNumb，从而在合并结果中排除pred_trip列：

> dt2
    grpno pred_trip locNumb pred_trip.1 locNumb.1
 1:     1         1       2           0        -1
 2:     2         1       2           0        -1
 3:     3         1       2           0        -1
 4:     4         1       2           0        -1
 5:     5         1       2           0        -1
 6:     6         1       2           1         2
 7:     7         1       2           1         2
 8:     8         1       2           1         2
 9:     9         1       2           1         2
10:    10         0       3           0         3
11:    11         0       3           0         3
12:    12         0       3           0         0
13:    13         0       3           1         0
14:    14         1       4           1         0
15:    15         1       4           1         4
16:    16         0       4           0         4
17:    17         0       4           0         4
18:    18         0       4           0         4
19:    19         0       4           0        -1
20:    20         0       4           0        -1
21:    21         0       4           0        -1
22:    22         0       4           0        -1
23:    23         0       4           0        -1
24:    24         0       4           0         0
25:    25         1       5           0         0
26:    26         1       5           0         0
27:    27         1       5           1         0
28:    28         1       5           1         0
29:    29         1       5           1         0
30:    30         1       5           1         5
31:    31         1       5           1         5
32:    32         1       5           1         5
33:    33         1       5           1         5

用R中最接近的条件值替换行值

2 个答案: