Question

我正在使用R

我有什么：

      ID_1     ID_2      Date        x_1        y_2     
1      12       3     2011-12-21       15        10     
2      12       13    2011-12-22       50        40     
3      3        12    2011-12-22       20        30     
4      15       13    2011-12-23       30        20     
...
and so on

TARGET：

      ID_1     ID_2      Date        x_1        y_2     XX_1      YY_2
1      12       3     2011-12-21       15        10      0         0
2      12       13    2011-12-22       50        40      15        0
3      3        12    2011-12-22       20        30      10        50
4      15       13    2011-12-23       30        20      0         40
...
and so on

我希望在XX_1和YY_2中看到x_1和y_2列中与ID_1和ID_2的值上一次出现时相对应的值；如果在该日期之前没有可用的值，则填"0"。我不知道如何处理同一个ID值既可能出现在ID_1中、也可能出现在ID_2中的事实（例如示例中的ID 3和12）。

@Ekatef ID1和ID2（找到整个ID行的匹配，即使切换了ID的顺序）：

ID_2

Answer 1

OP要求将ID（如果有）的先前值复制到相应的新列。

这可以通过以下步骤解决：将多个列同时从宽格式重塑为长格式，通过移位（滞后）找到每个ID的前一个值，最后再重塑回宽格式：

library(data.table)

# Convert DF to a data.table by reference and record the original row number,
# so the wide layout can be rebuilt after reshaping.
setDT(DF)
DF[, rn := .I]

# Melt the ID columns and the value columns simultaneously: each original row
# becomes two long rows (variable 1 and 2), each holding one ID and its value.
long <- melt(
  DF,
  id.vars = c("rn", "Date"),
  measure.vars = patterns("^ID", "^x|y"),
  value.name = c("ID", "value")
)

# Within each ID, walk the rows in date order and take the preceding value;
# the first occurrence of an ID has no predecessor and gets 0.
long[order(Date), previous := shift(value, n = 1L, fill = 0, type = "lag"),
     by = ID]

# Reshape back to one row per original record.
dcast(
  long,
  rn + Date ~ variable,
  value.var = c("ID", "value", "previous")
)

   rn       Date ID_1 ID_2 value_1 value_2 previous_1 previous_2
1:  1 2011-12-21   12    3      15      10          0          0
2:  2 2011-12-22   12   13      50      40         15          0
3:  3 2011-12-22    3   12      20      30         10         50
4:  4 2011-12-23   15   13      30      20          0         40

或者，可以将最后对dcast()的调用替换为连接时更新（update on join）：

# Update on join: copy the lagged values from `long` back into DF by row
# number (variable 1 feeds XX_1, variable 2 feeds YY_2), then drop the helper
# column `rn`.
DF[long, on = .(rn),
   c("XX_1", "YY_2") := .(previous[variable == 1L], previous[variable == 2L])][
     , rn := NULL]
# Print the updated table (a separate statement from the update above).
DF

   ID_1 ID_2       Date x_1 y_2 XX_1 YY_2
1:   12    3 2011-12-21  15  10    0    0
2:   12   13 2011-12-22  50  40   15    0
3:    3   12 2011-12-22  20  30   10   50
4:   15   13 2011-12-23  30  20    0   40

可以准确再现OP的预期结果。

数据

library(data.table)

# Sample data from the question. The string must contain line breaks so that
# fread() treats it as inline text; the first column (row index "i") is
# dropped on read.
DF <- fread(
  "i ID_1 ID_2 Date x_1 y_2
1 12 3 2011-12-21 15 10
2 12 13 2011-12-22 50 40
3 3 12 2011-12-22 20 30
4 15 13 2011-12-23 30 20
",
  drop = 1L
)

Answer 2

如果我理解正确的话，应该在位于给定ID所在行之上的所有行中，从左到右、从下到上查找目标ID。我会编写如下函数来找到该ID前一次出现的坐标：

# Find the coordinates of the most recent earlier occurrence of an ID.
#
# @id_matrix  matrix of IDs, one record per row, e.g.
#             your_data_frame[, c("ID_1", "ID_2")] (coerced with as.matrix())
# @i_of_row, @i_of_col  coordinates of the ID being looked up
#
# Returns c(row = , col = ): the position, within the rows strictly above
# i_of_row, of the last occurrence of that ID in row-major order. Both entries
# are NA when the ID does not occur earlier (always the case for i_of_row == 1).
FindPreviousID <- function(id_matrix, i_of_row, i_of_col) {
    # length() of a data frame is its column count, so work on a real matrix.
    id_matrix <- as.matrix(id_matrix)
    # seq_len() gives an empty selection for i_of_row == 1, unlike 1:0.
    shorten_matrix <- id_matrix[seq_len(i_of_row - 1), , drop = FALSE]
    n_cols <- ncol(shorten_matrix)
    # t() followed by rev() lists the earlier cells in reversed row-major
    # order, so the first match is the latest earlier occurrence. The ID is
    # taken from the id_matrix argument, not from a global object.
    rev_ind <- match(x = id_matrix[i_of_row, i_of_col],
        table = rev(t(shorten_matrix)), nomatch = NA_integer_)
    # Zero-based position of the match in (non-reversed) row-major order.
    offset <- length(shorten_matrix) - rev_ind
    n_row_found <- floor(offset / n_cols) + 1
    n_col_found <- offset %% n_cols + 1
    c(row = n_row_found, col = n_col_found)
}

...并将其用于计算XX_1和YY_2：

# emulate the original data frame
ID_1 <- c(12, 12, 3, 15, 16, 3)
ID_2 <- c(3, 13, 12, 13, 17, 15)
ids <- cbind(ID_1, ID_2) # ID columns
x1 <- c(15, 50, 20, 30, 51, 60)
y2 <- c(10, 40, 30, 20, 53, 62)
vars <- cbind(x1, y2) # x & y columns

# The first row has nothing above it, so only rows 2..n are looked up and the
# first XX_1 / YY_2 entries are set to 0 below.
rows_to_look_up <- seq_len(nrow(ids))[-1]

# vapply() always returns a 2 x n matrix (row, col), whatever the input length.
indices_XX <- vapply(rows_to_look_up,
    function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 1),
    FUN.VALUE = numeric(2))
indices_YY <- vapply(rows_to_look_up,
    function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 2),
    FUN.VALUE = numeric(2))

# construct XX and YY columns: index `vars` by the (row, col) pairs; IDs
# without an earlier occurrence give NA, which is replaced by 0
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0

希望，这有助于：）

更新：如果您想查找一对ID而不是单个ID，则应重新设计该函数。一种可能的解决方案如下所示：

# Find the most recent earlier row that holds the same pair of IDs as row
# i_of_row, regardless of the order of the IDs within the row.
#
# @id_matrix  matrix of IDs, one pair per row (coerced with as.matrix())
# @i_of_row   index of the row whose pair is looked up
#
# Returns an unnamed numeric vector of length 3:
#   [1] the row of the latest earlier occurrence of the pair,
#   [2] the column in that row holding the first ID of row i_of_row,
#   [3] the column in that row holding the second ID of row i_of_row.
# All three entries are NA when the pair does not occur earlier (always the
# case for i_of_row == 1).
FindPreviousIDsPair <- function(id_matrix, i_of_row) {
    id_matrix <- as.matrix(id_matrix)
    string_to_search_for <- id_matrix[i_of_row, ]
    string_to_search_for_sorted <-
        string_to_search_for[order(string_to_search_for)]

    # Compare each earlier row with the target after sorting both, so that
    # (a, b) matches (b, a). seq_len() is empty for i_of_row == 1, unlike 1:0.
    found_rows_boolean <- vapply(seq_len(i_of_row - 1), function(i) {
        candidate <- id_matrix[i, ]
        all(candidate[order(candidate)] == string_to_search_for_sorted)
    }, FUN.VALUE = logical(1))

    # which() ignores NA comparisons, so rows with missing IDs never match.
    matching_rows <- which(found_rows_boolean)
    if (length(matching_rows) == 0) {
        return(rep(NA_real_, 3))
    }

    found_row_n <- max(matching_rows)
    found_ids <- id_matrix[found_row_n, ]
    # as.numeric() keeps the return type the same as in the not-found case.
    as.numeric(c(found_row_n,
        match(string_to_search_for[1], found_ids),
        match(string_to_search_for[2], found_ids)))
}

应用重新设计的查找函数来计算XX和YY：

# Look up the previous occurrence of each row's ID pair (rows 2..n; the first
# row has nothing above it). vapply() always yields a 3 x n matrix whose rows
# are: found row, column of the first ID, column of the second ID.
rows_to_look_up <- seq_len(nrow(ids))[-1]
indices_of_vars <- vapply(rows_to_look_up,
    function(i) FindPreviousIDsPair(id_matrix = ids, i_of_row = i),
    FUN.VALUE = numeric(3))

# (row, col) pairs for XX and YY; drop = FALSE keeps a matrix even when only
# one row is looked up.
indices_XX <- indices_of_vars[1:2, , drop = FALSE]
indices_YY <- indices_of_vars[c(1, 3), , drop = FALSE]

# Index `vars` by the (row, col) pairs; pairs with no earlier occurrence give
# NA, which is replaced by 0. The leading NA stands for the first row.
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0

具有先前结果的列

2 个答案:

数据