我正在使用 R。
我现有的数据如下:
ID_1 ID_2 Date x_1 y_2
1 12 3 2011-12-21 15 10
2 12 13 2011-12-22 50 40
3 3 12 2011-12-22 20 30
4 15 13 2011-12-23 30 20
...
and so on
TARGET:
ID_1 ID_2 Date x_1 y_2 XX_1 YY_2
1 12 3 2011-12-21 15 10 0 0
2 12 13 2011-12-22 50 40 15 0
3 3 12 2011-12-22 20 30 10 50
4 15 13 2011-12-23 30 20 0 40
...
and so on
我希望在 XX_1 和 YY_2 中看到 x_1 和 y_2 列中与同一 ID(无论它出现在 ID_1 还是 ID_2 中)上一次出现时相对应的值;如果在该日期之前没有可用的值,则填入 "0"。我不知道如何处理同一个 ID 既可能出现在 ID_1 中、也可能出现在 ID_2 中这一事实(例如示例中的 ID 3 和 12)。
@Ekatef 需要同时匹配 ID_1 和 ID_2(即查找整行 ID 对的匹配,即使两个 ID 的顺序互换):
ID_2
答案 0 :(得分:2)
OP 要求将每个 ID 的上一个值(如果有)复制到相应的新列中。
这可以通过以下步骤解决:同时把多组列从宽格式重塑为长格式,用移位/滞后(shift/lag)找到每个 ID 的前一个值,然后再转换回宽格式:
library(data.table)
# Tag every row with its original position so that the wide layout can be
# rebuilt later and ties on Date can be broken deterministically.
setDT(DF)[, rn := .I]

# Reshape both column families at once: the ID_* columns become "ID" and the
# x_*/y_* columns become "value"; `variable` records which column of the pair
# each long row came from.
# The second pattern is "^(x|y)" rather than "^x|y": without the parentheses
# the anchor applies only to "x", so any column merely containing a "y" would
# be picked up as a measure column as well.
long <- melt(
  DF,
  id.vars = c("rn", "Date"),
  measure.vars = patterns("^ID", "^(x|y)"),
  value.name = c("ID", "value")
)

# Within each ID, walk the rows chronologically and take the preceding value
# (0 when the ID has no earlier row). `rn` breaks ties between rows sharing a
# Date; ordering by Date alone would leave such rows in melt order (all
# first-column rows before all second-column rows), which can place a later
# row ahead of an earlier one.
long[order(Date, rn), previous := shift(value, fill = 0), by = ID]

# Back to one row per original record, with ID_*, value_* and previous_*
# columns.
dcast(long, rn + Date ~ variable, value.var = c("ID", "value", "previous"))
   rn       Date ID_1 ID_2 value_1 value_2 previous_1 previous_2
1:  1 2011-12-21   12    3      15      10          0          0
2:  2 2011-12-22   12   13      50      40         15          0
3:  3 2011-12-22    3   12      20      30         10         50
4:  4 2011-12-23   15   13      30      20          0         40
或者,可以把最后的 dcast() 调用替换为连接时更新(update on join):
# Update DF by reference, joining the long table back on the row number.
# Each new column gets its own join against the matching half of `long`, so
# every row of DF is matched exactly once and receives exactly one value.
# A single join against all of `long` (two rows per rn) would have to assign
# vectors of length nrow(DF) to twice as many matched rows, which depends on
# `:=` recycling its right-hand side and on the row order of `long`.
DF[long[variable == "1"], on = .(rn), XX_1 := i.previous]
DF[long[variable == "2"], on = .(rn), YY_2 := i.previous]
DF[, rn := NULL]  # the helper row number is no longer needed
DF
   ID_1 ID_2       Date x_1 y_2 XX_1 YY_2
1:   12    3 2011-12-21  15  10    0    0
2:   12   13 2011-12-22  50  40   15    0
3:    3   12 2011-12-22  20  30   10   50
4:   15   13 2011-12-23  30  20    0   40
这样可以准确再现 OP 的预期结果。以下是所用的示例数据:
library(data.table)
# Sample data from the question, parsed from an inline text table.
# The first column ("i") is excluded from the result via `drop`.
DF <- fread(
  input = "i ID_1 ID_2 Date x_1 y_2
1 12 3 2011-12-21 15 10
2 12 13 2011-12-22 50 40
3 3 12 2011-12-22 20 30
4 15 13 2011-12-23 30 20 ",
  drop = 1L
)
答案 1 :(得分:1)
如果我理解正确的话,应该在严格位于给定 ID 所在行上方的所有行中,从左到右、从下到上查找目标 ID。我会编写一个函数来找出前一个相同 ID 的坐标,如下所示:
# Find the position of the most recent earlier occurrence of one ID.
#
# The rows above `i_of_row` are scanned from the last one upwards and, within
# a row, from the last column to the first; the first hit wins.
#
# Arguments:
#   id_matrix  matrix (or data frame) of IDs, one record per row,
#              e.g. your_data_frame[, c("ID_1", "ID_2")]
#   i_of_row   row index of the ID being looked up
#   i_of_col   column index of the ID being looked up
#
# Returns a numeric vector c(row = , col = ) holding the coordinates of the
# previous occurrence, or NA for both entries when the ID does not occur in
# any earlier row (always the case for i_of_row = 1).
FindPreviousID <- function(id_matrix, i_of_row, i_of_col) {
  # length() below has to count cells; that holds for a matrix, whereas for a
  # data frame it would count columns
  id_matrix <- as.matrix(id_matrix)
  n_cols <- ncol(id_matrix)
  # seq_len() gives an empty selection for the first row; 1:(i_of_row - 1)
  # would count down to c(1, 0) and select row 1 itself
  shorten_matrix <- id_matrix[seq_len(max(i_of_row - 1, 0)), , drop = FALSE]
  # t() followed by rev() lays the earlier cells out from the last cell of
  # the last row back to the first cell of the first row, so match() returns
  # the offset of the most recent occurrence.
  # The ID is read from the `id_matrix` argument, not from a global `ids`.
  rev_ind <- match(
    x = id_matrix[i_of_row, i_of_col],
    table = rev(t(shorten_matrix)),
    nomatch = NA_integer_
  )
  # zero-based row-major position of the hit among the earlier cells
  pos <- length(shorten_matrix) - rev_ind
  # split the position by the actual column count instead of a fixed 2
  n_row_found <- pos %/% n_cols + 1
  n_col_found <- pos %% n_cols + 1
  c(row = n_row_found, col = n_col_found)
}
……并用它来计算 XX_1 和 YY_2:
# Emulate the original data frame
ID_1 <- c(12, 12, 3, 15, 16, 3)
ID_2 <- c(3, 13, 12, 13, 17, 15)
ids <- cbind(ID_1, ID_2)  # ID columns
x1 <- c(15, 50, 20, 30, 51, 60)
y2 <- c(10, 40, 30, 20, 53, 62)
vars <- cbind(x1, y2)  # x & y columns

# The first row is skipped in the lookups; its XX_1 / YY_2 entries are
# assumed to be always 0 and are filled in below.
# Each result is a 2 x (n - 1) matrix with one (row, col) coordinate pair per
# looked-up row. vapply() guarantees that shape even when there is nothing to
# look up, where sapply() would return an empty list instead.
indices_XX <- vapply(
  seq_len(nrow(ids))[-1],
  function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 1),
  FUN.VALUE = numeric(2)
)
indices_YY <- vapply(
  seq_len(nrow(ids))[-1],
  function(i) FindPreviousID(id_matrix = ids, i_of_row = i, i_of_col = 2),
  FUN.VALUE = numeric(2)
)

# Construct the XX and YY columns: transposed, the coordinates form a
# two-column index matrix that picks one cell of `vars` per row. Rows without
# a previous ID have NA coordinates and therefore yield NA, replaced by 0.
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0
希望这对你有所帮助 :)
更新:如果您想查找的是一对 ID 而不是单个 ID,则应重新设计该函数。一种可能的解决方案如下所示:
# Find the most recent earlier row holding the same pair of IDs as row
# `i_of_row`, regardless of the order of the IDs within the row.
#
# Arguments:
#   id_matrix  matrix (or data frame) of IDs, one pair per row
#   i_of_row   index of the row whose ID pair is looked up
#
# Returns an unnamed numeric vector of length 3:
#   [1] index of the matching earlier row,
#   [2] column of that row holding the first ID of row `i_of_row`,
#   [3] column of that row holding the second ID of row `i_of_row`;
# all three entries are NA when no earlier row holds the same pair (always
# the case for i_of_row = 1).
FindPreviousIDsPair <- function(id_matrix, i_of_row) {
  not_found <- rep(NA_real_, 3L)
  # row extraction below must yield plain vectors, also for data-frame input
  id_matrix <- as.matrix(id_matrix)
  # seq_len() gives an empty selection for the first row; 1:(i_of_row - 1)
  # would count down to c(1, 0) instead
  earlier_rows <- id_matrix[seq_len(max(i_of_row - 1, 0)), , drop = FALSE]
  if (nrow(earlier_rows) == 0L) {
    return(not_found)
  }

  target_pair <- id_matrix[i_of_row, ]
  target_sorted <- target_pair[order(target_pair)]
  # sorting both sides makes the comparison independent of the ID order;
  # isTRUE() turns an NA comparison result into "no match"
  is_same_pair <- vapply(
    seq_len(nrow(earlier_rows)),
    function(i) {
      candidate <- earlier_rows[i, ]
      isTRUE(all(candidate[order(candidate)] == target_sorted))
    },
    FUN.VALUE = logical(1)
  )
  if (!any(is_same_pair)) {
    return(not_found)
  }

  found_row <- max(which(is_same_pair))  # the latest match wins
  found_ids <- earlier_rows[found_row, ]
  col_of_id1 <- match(target_pair[1], found_ids)
  # mask the column already taken by the first ID so that a pair made of two
  # equal IDs maps to two different columns
  col_of_id2 <- match(target_pair[2], replace(found_ids, col_of_id1, NA))
  as.numeric(c(found_row, col_of_id1, col_of_id2))
}
应用重新设计的查找函数来计算 XX 和 YY:
# One column per looked-up row (the first row is skipped), holding the
# matching earlier row and the columns of the first and the second ID in it.
# vapply() fixes the result to a 3 x (n - 1) numeric matrix, where sapply()
# would return an empty list when there is nothing to look up.
indices_of_vars <- vapply(
  seq_len(nrow(ids))[-1],
  function(i) FindPreviousIDsPair(id_matrix = ids, i_of_row = i),
  FUN.VALUE = numeric(3)
)

# (row, col) coordinates of the previous value for each of the two IDs;
# drop = FALSE keeps the matrix shape when only one row was looked up
indices_XX <- indices_of_vars[1:2, , drop = FALSE]
indices_YY <- indices_of_vars[c(1, 3), , drop = FALSE]

# Transposed, the coordinates form a two-column index matrix that picks one
# cell of `vars` per row; NA coordinates yield NA, which is replaced by 0.
XX_column <- c(NA, vars[t(indices_XX)])
XX_column[is.na(XX_column)] <- 0
YY_column <- c(NA, vars[t(indices_YY)])
YY_column[is.na(YY_column)] <- 0