这是我的data.frame的头部,在r中的df。任何行或列都没有模式。
{
"keyOne": "String",
"keyTwo": 7,
"keyThree": {
"subKeyOne": "String",
"url": "String"
},
"arrayKey": [
{
"arrayKeyOne":"String",
"url": "String"
},
{
"arrayKeyOne":"String",
"url": "String"
}
],
"url":"String"
}
对于df $ Type ==" B"的每一行,我想用df $ Type ==" A"找到上一行和下一行,然后提取他们的&# 34; V1"和" V2"。
期望的输出,
Type SIZE V1 V2
A 1 5 7
B 1 NA NA
B 3 NA NA
B 4 NA NA
A 8 2 4
A 6 6 50
A 12 2 8
B 8 NA NA
A 9 51 63
A 11 93 70
非常感谢有人可以提供帮助,
答案 0 :(得分:1)
例如,首先存储type
为A
的索引...例如,
dat <- data.frame(type = c("A", "B", "B", "B", "A", "A", "A", "B", "A", "A"),
size = c(1, 1, 3, 4, 8, 6, 12, 8, 9, 11),
v1 = c(5, NA, NA, NA, 2, 6, 2, NA, 51, 93),
v2 = c(7, NA, NA, NA, 4, 50, 8, NA, 63, 70))
dat$idx <- 1:nrow(dat)
a_idx <- which(dat$type == "A")
b_idx <- which(dat$type == "B")
然后,您可以轻松找到最后一个/ {下一个B
>
和<
... sapply
,
new <- sapply(b_idx, function(x) {
lag_idx <- tail(a_idx[a_idx < x], 1)
lead_idx <- head(a_idx[a_idx > x], 1)
return (t(c(dat$v1[lag_idx], dat$v2[lag_idx],
dat$v1[lead_idx], dat$v2[lead_idx])))
}
)
new <- t(new)
new <- cbind(new, b_idx)
colnames(new) <- c("V1_Lag", "V2_Lag", "V1_Lead", "V2_Lead", "idx")
merge(dat, new, all = TRUE)
idx type size v1 v2 V1_Lag V2_Lag V1_Lead V2_Lead
1 1 A 1 5 7 NA NA NA NA
2 2 B 1 NA NA 5 7 2 4
3 3 B 3 NA NA 5 7 2 4
4 4 B 4 NA NA 5 7 2 4
5 5 A 8 2 4 NA NA NA NA
6 6 A 6 6 50 NA NA NA NA
7 7 A 12 2 8 NA NA NA NA
8 8 B 8 NA NA 2 8 51 63
9 9 A 9 51 63 NA NA NA NA
10 10 A 11 93 70 NA NA NA NA
答案 1 :(得分:1)
使用此数据
dat <- data.frame(
type = c("A", "B", "B", "B", "A", "A", "A", "B", "A", "A"),
size = c(1, 1, 3, 4, 8, 6, 12, 8, 9, 11),
v1 = c(5, NA, NA, NA, 2, 6, 2, NA, 51, 93),
v2 = c(7, NA, NA, NA, 4, 50, 8, NA, 63, 70),
stringsAsFactors = FALSE
)
计算&#39;游程长度编码&#39; type
列
r <- rle(dat$type)
与
> r
Run Length Encoding
lengths: int [1:5] 1 3 3 1 2
values : chr [1:5] "A" "B" "A" "B" "A"
(即,1A,然后是3B,3A,1B和2A)。滞后值的指数是
lag <- setdiff(
cumsum(r$lengths)[r$values == "A"],
nrow(dat) # ignore "A" value at end of column
)
需要复制每个滞后值,新值v1lag
填充为
value <- rep(dat$v1[lag], r$length[r$value == "B"])
类似的故事与主导价值
一起发挥作用lead <- pmin(
cumsum(r$lengths)[r$values == "B"] + 1L,
nrow(dat) # ignore "B" value at end of column
)
value <- rep(dat$v1[lead], r$length[r$value == "B"])
具体问题的实施是
mm <- function(df) {
r <- rle(df$type)
lag <- setdiff(cumsum(r$lengths)[r$values == "A"], nrow(df))
lead <- pmin(cumsum(r$lengths)[r$values == "B"] + 1L, nrow(df))
len <- r$length[r$value == "B"]
idx <- df$type == "B"
df$v1_lag[idx] <- rep(df$v1[lag], len)
df$v2_lag[idx] <- rep(df$v2[lag], len)
df$v1_lead[idx] <- rep(df$v1[lead], len)
df$v2_lead[idx] <- rep(df$v2[lead], len)
df
}
这比erocoar的解决方案更快,更强大。