具有先前结果的列

时间:2017-12-25 12:14:56

标签: r database dataframe

我正在使用R

我有什么:

      ID_1     ID_2      Date        x_1        y_2     
1      12       3     2011-12-21       15        10     
2      12       13    2011-12-22       50        40     
3      3        12    2011-12-22       20        30     
4      15       13    2011-12-23       30        20     
...
and so on

TARGET:

      ID_1     ID_2      Date        x_1        y_2     XX_1      YY_2
1      12       3     2011-12-21       15        10      0         0
2      12       13    2011-12-22       50        40      15        0
3      3        12    2011-12-22       20        30      10        50
4      15       13    2011-12-23       30        20      0         40
...
and so on

我希望在XX_1和YY_2中看到x_1和y_2列中与先前出现的ID_1和ID_2相对应的值;如果在该日期之前没有可用的值,则为"0"。我不知道如何处理同一个ID既可能出现在ID_1中、也可能出现在ID_2中这一事实(例如示例中的ID 3和12)。

@Ekatef 需要同时匹配ID_1和ID_2(即对整行的ID对进行匹配,即使两个ID的顺序互换了也算匹配):

ID_2

2 个答案:

答案 0 :(得分:2)

OP要求将ID(如果有)的先前值复制到相应的新列。

这可以通过同时重塑多个列从宽格式到长格式,通过移位/滞后找到前一个值,并重新变换为宽格式来解决:

library(data.table)

# Tag every row with its row number so the long table can be mapped back to
# the wide layout afterwards.
setDT(DF)[, rn := .I]

# Reshape both column families to long format in one pass: ID_1/ID_2 become
# `ID`, x_1/y_2 become `value`.
# The value pattern is anchored as "^[xy]_": the unanchored "^x|y" parses as
# "(^x)|(y)" and would also pick up any column that merely contains a "y".
long <- melt(DF, id.vars = c("rn", "Date"),
             measure.vars = patterns("^ID", "^[xy]_"),
             value.name = c("ID", "value"))

# Within each ID, taken in date order, the previous value is the lagged value;
# the first appearance of an ID gets 0.
long[order(Date), previous := shift(value, fill = 0), by = ID]

# Back to one row per original record, with ID / value / previous spread over
# the two slots.
dcast(long, rn + Date ~ variable, value.var = c("ID", "value", "previous"))
   rn       Date ID_1 ID_2 value_1 value_2 previous_1 previous_2
1:  1 2011-12-21   12    3      15      10          0          0
2:  2 2011-12-22   12   13      50      40         15          0
3:  3 2011-12-22    3   12      20      30         10         50
4:  4 2011-12-23   15   13      30      20          0         40

或者,可以把最后对dcast()的调用替换为连接时更新(update join):

# Update join: write each slot's previous value back into DF, matching on rn.
# Each slot is joined separately so that exactly one row of `long` meets each
# row of DF. Joining all of `long` at once matches every rn twice while the
# right-hand side only has one value per rn, which depends on `:=` recycling a
# shorter vector -- NOTE(review): recent data.table releases refuse to do that.
DF[long[variable == 1L], on = .(rn), XX_1 := i.previous]
DF[long[variable == 2L], on = .(rn), YY_2 := i.previous]
# The helper row number is no longer needed.
DF[, rn := NULL]
DF
   ID_1 ID_2       Date x_1 y_2 XX_1 YY_2
1:   12    3 2011-12-21  15  10    0    0
2:   12   13 2011-12-22  50  40   15    0
3:    3   12 2011-12-22  20  30   10   50
4:   15   13 2011-12-23  30  20    0   40

可以准确再现OP的预期结果。

数据

library(data.table)

# Sample data from the question, parsed from inline text. The header labels the
# leading row-number column "i" so it can be discarded while reading.
DF <- fread(
  input = "i      ID_1     ID_2      Date        x_1        y_2     
  1      12       3     2011-12-21       15        10     
  2      12       13    2011-12-22       50        40     
  3      3        12    2011-12-22       20        30     
  4      15       13    2011-12-23       30        20  ",
  drop = "i"
)

答案 1 :(得分:1)

如果我理解正确的话,应当在严格位于给定ID所在行上方的所有行中,按从左到右、从下到上的顺序查找目标ID。我会编写一个函数来找到前一次出现的ID的坐标,如下所示:

# Find where the ID stored at id_matrix[i_of_row, i_of_col] last occurred
# above row i_of_row.
#
# @id_matrix  matrix (or data frame) of IDs, one row per record,
#             e.g. your_data_frame[, c("ID_1", "ID_2")]
# @i_of_row   row index of the ID being looked up
# @i_of_col   column index of the ID being looked up
#
# Returns c(row = , col = ): the coordinates of the latest earlier occurrence
# (the lowest matching row above i_of_row and, within that row, the right-most
# matching column). Both entries are NA when the ID does not occur in any
# earlier row, which includes i_of_row == 1.
FindPreviousID <- function(id_matrix, i_of_row, i_of_col) {
  # length() and t() below need matrix semantics; a data frame would report
  # its column count as length().
  id_matrix <- as.matrix(id_matrix)

  # seq_len() gives an empty selection for i_of_row == 1, whereas
  # 1:(i_of_row - 1) would count down (1, 0) and select row 1 itself.
  earlier_rows <- id_matrix[seq_len(i_of_row - 1), , drop = FALSE]
  n_cells <- length(earlier_rows)
  n_cols <- ncol(earlier_rows)

  # t() followed by rev() lays the earlier cells out in reverse row-major
  # order, so the first hit of match() is the latest earlier occurrence.
  # The target is read from the id_matrix argument, not from a global object.
  rev_ind <- match(id_matrix[i_of_row, i_of_col], rev(t(earlier_rows)))

  # Turn the position in the reversed vector into a 0-based row-major offset,
  # then into 1-based (row, col). Dividing by the actual column count rather
  # than a literal 2 keeps this valid for any number of ID columns.
  offset <- n_cells - rev_ind
  c(row = offset %/% n_cols + 1, col = offset %% n_cols + 1)
}

...并用它来计算XX_1和YY_2

# Emulate the original data frame.
ID_1 <- c(12, 12, 3, 15, 16, 3)
ID_2 <- c(3, 13, 12, 13, 17, 15)
ids <- cbind(ID_1, ID_2) # ID columns
x1 <- c(15, 50, 20, 30, 51, 60)
y2 <- c(10, 40, 30, 20, 53, 62)
vars <- cbind(x1, y2) # x & y columns

# For every row after the first, locate the previous occurrence of the ID held
# in column 1 (for XX) and column 2 (for YY). vapply() pins the result to a
# 2 x (n - 1) numeric matrix; sapply() would return an empty list for one-row
# input, on which the t() below fails.
indices_XX <- vapply(
  seq_len(nrow(ids))[-1],
  function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 1),
  numeric(2)
)
indices_YY <- vapply(
  seq_len(nrow(ids))[-1],
  function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 2),
  numeric(2)
)

# Construct the XX and YY columns. Each transposed index matrix holds one
# (row, col) pair per line and is used to pick single cells out of vars; pairs
# that are NA (no earlier occurrence) yield NA. The first row never has a
# predecessor, so it starts as NA too. All NA entries are then set to 0.
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0

希望这对你有帮助 :)

更新:如果您想查找的是一对ID而不是单个ID,则需要重新设计该函数。一种可能的解决方案如下所示:

# Find the latest row above i_of_row that holds the same set of IDs as row
# i_of_row, regardless of the order in which the IDs appear.
#
# @id_matrix  matrix (or data frame) of IDs, one row per record
# @i_of_row   index of the row whose ID pair is being looked up
#
# Returns an unnamed vector of length 3:
#   [1] index of the matching earlier row
#   [2] column of that row holding the ID found in column 1 of row i_of_row
#   [3] column of that row holding the ID found in column 2 of row i_of_row
# All three are NA when no earlier row matches, which includes i_of_row == 1.
FindPreviousIDsPair <- function(id_matrix, i_of_row) {
  id_matrix <- as.matrix(id_matrix)

  # seq_len() gives an empty selection for i_of_row == 1, whereas
  # 1:(i_of_row - 1) would count down (1, 0) and compare row 1 with itself.
  earlier_rows <- id_matrix[seq_len(i_of_row - 1), , drop = FALSE]
  target_ids <- id_matrix[i_of_row, ]
  target_sorted <- target_ids[order(target_ids)]

  # A row matches when its IDs, sorted, equal the sorted target IDs.
  is_same_pair <- vapply(
    seq_len(nrow(earlier_rows)),
    function(i) {
      candidate <- earlier_rows[i, ]
      all(candidate[order(candidate)] == target_sorted)
    },
    logical(1)
  )

  # which() ignores NA comparisons, so rows with missing IDs never match.
  hits <- which(is_same_pair)
  if (length(hits) == 0) {
    return(rep(NA_real_, 3))
  }

  # The largest index is the closest row above i_of_row.
  found_row_n <- max(hits)
  found_ids <- earlier_rows[found_row_n, ]
  c(
    found_row_n,
    match(target_ids[1], found_ids),
    match(target_ids[2], found_ids)
  )
}

应用重新设计的查找函数来计算XX和YY:

# For every row after the first, look up the latest earlier row holding the
# same ID pair. vapply() pins the result to a 3 x (n - 1) numeric matrix whose
# rows are: matched row, column of the first ID, column of the second ID.
# sapply() would return an empty list for one-row input and break the
# subsetting below.
indices_of_vars <- vapply(
  seq_len(nrow(ids))[-1],
  function(i) FindPreviousIDsPair(id_matrix = ids, i_of_row = i),
  numeric(3)
)

# drop = FALSE keeps the 2 x k matrix shape even when k is 0 or 1.
indices_XX <- indices_of_vars[1:2, , drop = FALSE]
indices_YY <- indices_of_vars[c(1, 3), , drop = FALSE]

# Each transposed index matrix holds one (row, col) pair per line and picks
# single cells out of vars; NA pairs (no earlier match) yield NA. The first
# row never has a predecessor, so it starts as NA too. All NA entries are then
# set to 0.
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0