R - 根据第一集为矢量赋值

时间:2018-01-05 19:01:08

标签: r vector sequence

所以我有一个看起来像这样的序列数据集

  id epnum clockst
1  1     1       0
2  1     2       1
3  1     3       2
4  2     1       4
5  2     2       5
6  2     3       6
7  3     1       4
8  3     2       5
9  3     3       6

我想要的是创建一个新向量:对每个 id,取 epnum == 1 那一行的 clockst 值,并把它赋给该 id 的所有行。

所以,我基本上想要这个

  id epnum clockst ep_start
1  1     1       0        0
2  1     2       1        0
3  1     3       2        0
4  2     1       4        4
5  2     2       5        4
6  2     3       6        4
7  3     1       4        4
8  3     2       5        4
9  3     3       6        4

然而,我很难这样做。

我想出了这个,但它没有完全奏效。

# Attempted approach: rows with epnum == 1 receive 0 or 4 depending on their
# clockst; every other row falls through to the -9 placeholder.
is_first <- dt$epnum == 1
dt$ep_start <- ifelse(
  is_first & dt$clockst == 0, 0,
  ifelse(is_first & dt$clockst == 4, 4, -9)
)

有什么想法吗?

数据

# Example data: three ids, each with episodes 1-3 and the clock start value of
# every episode. All three columns are factors.
dt <- data.frame(
  id = factor(rep(c("1", "2", "3"), each = 3L)),
  epnum = factor(rep(c("1", "2", "3"), times = 3L)),
  clockst = factor(c("0", "1", "2", "4", "5", "6", "4", "5", "6"))
)

5 个答案:

答案 0 :(得分:3)

以下是使用tidyverse的解决方案:

首先检查条件 epnum == 1:如果为 TRUE,则使用 clockst 的值,否则使用 NA。然后只需用前面最近的非 NA 值向下填充这些 NA。

由于 clockst 是一个因子(factor),需要在保持原值不变的前提下把它转换为数值,因此要使用 as.numeric(as.character(clockst));直接使用 as.numeric(clockst) 得到的是因子的内部整数编码,而不是原来的值。

library(tidyverse)

# clockst is a factor: go through character so the printed values (not the
# internal level codes) are recovered as numbers.
clock_values <- as.numeric(as.character(dt$clockst))

dt %>%
  # Keep the clock value only on first-episode rows, NA everywhere else ...
  mutate(ep_start = ifelse(epnum == 1, clock_values, NA)) %>%
  # ... then carry each value down over the NA rows that follow it.
  fill(ep_start, .direction = "down")
#output:
  id epnum clockst ep_start
1  1     1       0        0
2  1     2       1        0
3  1     3       2        0
4  2     1       4        4
5  2     2       5        4
6  2     3       6        4
7  3     1       4        4
8  3     2       5        4
9  3     3       6        4

以下是可用答案的快速比较。我选择使用90 k行数据集:

# data.table() is called below, so the package must be attached first.
library(data.table)
library(microbenchmark)

# Replicate the 9-row example 10,000 times to obtain the 90k-row data set.
# seq_len() is used instead of 1:nrow(df) so an empty df cannot yield c(1, 0).
df <- df[rep(seq_len(nrow(df)), times = 10000), ] # where df = dt

# data.table copy of the same rows, used by the data.table answer.
dt <- data.table(df)

# One entry per answer; the dplyr/tidyr entries rely on the tidyverse being
# attached already.
bench <- microbenchmark(
  SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
  missuse = df %>%
    mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
    fill(ep_start, .direction = "down"),
  d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
  www = df %>%
    arrange(id, epnum) %>%
    group_by(id) %>%
    mutate(ep_start = first(clockst)) %>%
    ungroup()
)

plot(bench)

enter image description here

具有900 k行数据集:

enter image description here

哦,我真的需要学习DT。

答案 1 :(得分:2)

另一个tidyverse解决方案。如果您确定行的顺序正确,则不需要arrange

library(dplyr)

# Order the rows by id and then by epnum, so the first row of each id is the
# one whose clockst should be propagated.
sorted <- arrange(dt, id, epnum)
# Copy the first clockst of each id onto all of that id's rows, then drop the
# grouping again.
dt2 <- ungroup(mutate(group_by(sorted, id), ep_start = first(clockst)))
dt2
# # A tibble: 9 x 4
#   id     epnum  clockst ep_start
#   <fctr> <fctr> <fctr>  <fctr>  
# 1 1      1      0       0       
# 2 1      2      1       0       
# 3 1      3      2       0       
# 4 2      1      4       4       
# 5 2      2      5       4       
# 6 2      3      6       4       
# 7 3      1      4       4       
# 8 3      2      5       4       
# 9 3      3      6       4   

答案 2 :(得分:1)

您可以使用library(data.table)执行此操作,如下所示

library(data.table)

# Convert the question's data frame `dt` to a data.table. (`T` is avoided as a
# name: it is R's built-in alias for TRUE and is not the data set.)
dt <- data.table(dt)
# Within each id, take clockst from the group's first row and assign it to all
# rows of that id. Assumes the first row of each id is its epnum == 1 row.
dt[, ep_start := .SD[1]$clockst, by = "id"]

这给出了:

   id epnum clockst ep_start
1:  1     1       0        0
2:  1     2       1        0
3:  1     3       2        0
4:  2     1       4        4
5:  2     2       5        4
6:  2     3       6        4
7:  3     1       4        4
8:  3     2       5        4
9:  3     3       6        4

答案 3 :(得分:1)

# Flag the rows that start a new block (epnum == 1).
is_first <- dt$epnum == 1
# Row index of every block start.
start_rows <- which(is_first)
# cumsum() numbers the blocks; rle() then gives the length of each block.
block_lengths <- rle(cumsum(is_first))$lengths
# Repeat each start row once per row of its block and pick clockst there.
dt$ep_start <- dt$clockst[rep(start_rows, block_lengths)]
dt
#  id epnum clockst ep_start
#1  1     1       0        0
#2  1     2       1        0
#3  1     3       2        0
#4  2     1       4        4
#5  2     2       5        4
#6  2     3       6        4
#7  3     1       4        4
#8  3     2       5        4
#9  3     3       6        4

答案 4 :(得分:0)

使用match

# Rows holding the first episode (epnum == 1).
first_ep <- dt[dt$epnum == 1, ]
# For every row, find the position of its id among the first-episode rows and
# take the clockst stored there.
id_pos <- match(dt$id, first_ep$id)
dt$ep_start <- first_ep$clockst[id_pos]