在另一列中的特定值之前和之后标记观察

时间:2017-09-15 01:42:28

标签: r dplyr data.table

假设我有一个df:

# Example data: 20 rows, every row included, with rows 4, 8 and 16 flagged.
df <- data.frame(
  flag = replace(numeric(20), c(4, 8, 16), 1),
  include = rep(1, 20)
)
df

   flag include
1     0       1
2     0       1
3     0       1
4     1       1
5     0       1
6     0       1
7     0       1
8     1       1
9     0       1
10    0       1
11    0       1
12    0       1
13    0       1
14    0       1
15    0       1
16    1       1
17    0       1
18    0       1
19    0       1
20    0       1

如果某行位于flag == 1的行的+/-两行之内,我希望将该行的include标志更改为0。结果如下:

   flag include
1     0       1
2     0       0
3     0       0
4     1       1
5     0       0
6     0       0
7     0       0
8     1       1
9     0       0
10    0       0
11    0       1
12    0       1
13    0       1
14    0       0
15    0       0
16    1       1
17    0       0
18    0       0
19    0       1
20    0       1

我想到了一些“创新”(即:效率低下且过于复杂)的方法,但我认为一定有一种被我忽略了的简单方法。

如果答案能够推广到+/- n行就更好了,因为我有更多的数据,并且可能需要在+/- 10行内进行搜索......

4 个答案:

答案 0 :(得分:3)

data.table的另一个选项:

library(data.table)

# half-width of the window around each flagged row
n <- 2

# positions of the rows whose flag equals one
flag_one <- which(df$flag == 1)

# every position within +/- n of a flagged row, minus the flagged rows themselves
idx <- setdiff(outer(flag_one, -n:n, "+"), flag_one)

# convert df to a data.table by reference and zero include at the positions
# that actually exist in the table; the trailing [] prints the result
setDT(df)[idx[idx >= 1 & idx <= nrow(df)], include := 0][]

# as @Frank commented, the last step can also be done in base R:
# df$include[idx[idx >= 1 & idx <= nrow(df)]] = 0

#    flag include
# 1:    0       1
# 2:    0       0
# 3:    0       0
# 4:    1       1
# 5:    0       0
# 6:    0       0
# 7:    0       0
# 8:    1       1
# 9:    0       0
#10:    0       0
#11:    0       1
#12:    0       1
#13:    0       1
#14:    0       0
#15:    0       0
#16:    1       1
#17:    0       0
#18:    0       0
#19:    0       1
#20:    0       1

封装成函数:

#' Zero out `include` for rows close to a flagged row.
#'
#' @param df A data frame with a `flag` column and an `include` column.
#' @param n Number of rows before and after each flagged row to affect.
#' @return `df` with `include` set to 0 for every row that lies within `n`
#'   rows of a row where `flag == 1`; the flagged rows themselves are not
#'   modified.
update_n <- function(df, n) {
    flagged <- which(df$flag == 1)
    offsets <- (-n):n
    # all positions reachable from a flagged row, flattened and de-duplicated
    neighbours <- unique(as.vector(outer(flagged, offsets, "+")))
    # flagged rows keep their own include value
    neighbours <- neighbours[!neighbours %in% flagged]
    # discard positions that fall before the first or after the last row
    in_range <- neighbours >= 1 & neighbours <= nrow(df)
    df$include[neighbours[in_range]] <- 0
    df
}

答案 1 :(得分:2)

一定还有其他更简单的方法,但我能想到的第一种方法是使用sapply和which

# Rows whose flag is set; these keep their own include value.
flagged <- which(df$flag == 1)
# Candidate neighbours: the two rows before and the two rows after each flagged row.
neighbours <- as.integer(unlist(lapply(flagged, function(x) x + c(-2, -1, 1, 2))))
# Keep only positions that exist in df: a negative index cannot be mixed with
# positive ones, and an index past the last row would extend the column.
# Flagged rows are removed so that adjacent flags do not zero each other.
neighbours <- setdiff(neighbours[neighbours >= 1 & neighbours <= nrow(df)], flagged)
df$include[neighbours] <- 0

df
#   flag include
#1     0       1
#2     0       0
#3     0       0
#4     1       1
#5     0       0
#6     0       0
#7     0       0
#8     1       1
#9     0       0
#10    0       0
#11    0       1
#12    0       1
#13    0       1
#14    0       0
#15    0       0
#16    1       1
#17    0       0
#18    0       0
#19    0       1
#20    0       1

我们首先找出flag为1的所有索引,然后围绕每个索引创建所需的数字序列,并将include在这些索引处的值设为0。

对于变量n,我们可以

# Half-width of the window around each flagged row.
n <- 2
# Rows whose flag is set; these keep their own include value.
flagged <- which(df$flag == 1)
# For each flagged row, take the positions x-n .. x+n except flagged rows.
# lapply + unlist yields a plain vector even when no row is flagged
# (sapply would return an empty list, which is not a valid subscript).
neighbours <- as.integer(unlist(lapply(
  flagged,
  function(x) setdiff(seq(x - n, x + n), flagged)
)))
# Keep only positions that exist in df: a negative index cannot be mixed with
# positive ones, and an index past the last row would extend the column.
neighbours <- neighbours[neighbours >= 1 & neighbours <= nrow(df)]
df$include[neighbours] <- 0

答案 2 :(得分:1)

对于+/- n行,可以使用replace:

replace(x = df$include, list = sapply(1:NROW(df), function(i) any(df$flag[c(max(1, i-n):max(1, i-1), min(i+1, NROW(df)):min(i+n, NROW(df)))] == 1)), values = 0)

{
  "entry": [
    {
    "blockNumber": "1941794",
    "blockHash": "0x41ee74e34cbf9ef4116febea958dbc260e2da3a6bf6f601bfaeb2cd9ab944a29",
    "hash": "0xf2b5b8fb173e371cbb427625b0339f6023f8b4ec3701b7a5c691fa9cef9daf63",
    "from": "0x3c0cbb196e3847d40cb4d77d7dd3b386222998d9",
    "to": "0x2ba24c66cbff0bda0e3053ea07325479b3ed1393",
    "gas": "121000",
    "gasUsed": "21000",
    "gasPrice": "20000000000",
    "input": "",
    "logs": [],
    "nonce": "14",
    "value": "0x24406420d09ce7440000",
    "timestamp": "2016-07-24 20:28:11 UTC"
    },
    {
    "blockNumber": "1941716",
    "blockHash": "0x75e1602cad967a781f4a2ea9e19c97405fe1acaa8b9ad333fb7288d98f7b49e3",
    "hash": "0xf8f2a397b0f7bb1ff212b6bcc57e4a56ce3e27eb9f5839fef3e193c0252fab26",
    "from": "0xa0480c6f402b036e33e46f993d9c7b93913e7461",
    "to": "0xb2ea1f1f997365d1036dd6f00c51b361e9a3f351",
    "gas": "121000",
    "gasUsed": "21000",
    "gasPrice": "20000000000",
    "input": "",
    "logs": [],
    "nonce": "1",
    "value": "0xde0b6b3a7640000",
    "timestamp": "2016-07-24 20:12:17 UTC"
  }
]
}

答案 3 :(得分:1)

另一种方法是使用zoo::rollapply。要确定行是否在flag == 1行的+/-两行内,我们会检查窗口中的最大flag是否为1。

我们需要rollapply而不是rollmax,因为我们需要指定partial = T

#' Test whether each position lies within `n` positions of a flagged one.
#'
#' @param flag Numeric vector of flags.
#' @param n Number of positions on each side to look at; the rolling window
#'   has width `2 * n + 1`.
#' @return The element-wise comparison `== 1` of the rolling window maximum
#'   of `flag`, as computed by `zoo::rollapply()`.
is_within_flag_window <- function(flag, n) {
  # partial = TRUE lets the window shrink at both ends of the vector, so a
  # value is produced for every position. TRUE is spelled out because `T` is
  # an ordinary variable that can be reassigned.
  zoo::rollapply(flag, width = (2 * n) + 1, partial = TRUE, FUN = max) == 1
}

# Recompute include: flagged rows get 1, rows within two rows of a flagged
# row get 0, and every remaining row gets 1.
mutate(
  df,
  include = ifelse(
    flag == 1,
    1,
    ifelse(is_within_flag_window(flag, n = 2), 0, 1)
  )
)