从数据框中选择子序列

时间:2018-10-08 07:50:14

标签: r dataframe sequence

我有以下数据框:

# Example data: eight rows with a numeric column `a` and a character column
# `b`. The object carries the tibble classes on top of "data.frame" and uses
# compact automatic row names.
df <- structure(
  list(
    a = c(1, 43, 22, 12, 35, 113, 54, 94),
    b = c("a", "b", "c", "d", "e", "f", "g", "h")
  ),
  row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame")
)

我想从这些数据中选择指定长度的连续子序列。例如,对于长度为2的子序列,我想依次选择第1-2行、第2-3行、第3-4行等,直到数据框的最后一行。然后为每个子序列加上标签。

子序列长度为2,带有其序列标签的新df如下所示:

a   b   seq_label
1   a   1 # First subsequence, row 1-2      
43  b   1 # 
43  b   2 # Second subsequence, row 2-3     
22  c   2 #         
22  c   3 # Third subsequence, row 3-4
12  d   3 #     
12  d   4
35  e   4       
35  e   5
113 f   5       
113 f   6
54  g   6       
54  g   7
94  h   7

类似,子序列长度为3:

a   b  seq_label
1   a  1 # First subsequence, row 1-3
43  b  1 #          
22  c  1 #
43  b  2 # Second subsequence, row 2-4
22  c  2 #
12  d  2 #
22  c  3 # Third subsequence, row 3-5
12  d  3 #
35  e  3 #
12  d  4
35  e  4
113 f  4
35  e  5
113 f  5
54  g  5
113 f  6
54  g  6
94  h  6

....

感谢@drjones给出的答案,我在此基础上得到了如下解决方案(其中n为子序列长度):

# For every start row i, take the n consecutive rows i..(i + n - 1) of df and
# tag them with seq_label = i; map_dfr() (from purrr) row-binds the pieces.
# Expects `df` and the window length `n` to be defined already.
# seq_len() gives an empty sequence when n > nrow(df), whereas
# 1:(nrow(df) - n + 1) would count down (1, 0, ...) and index invalid rows.
map_dfr(seq_len(max(nrow(df) - n + 1, 0)), function(i) {
  cbind(df[i:(i + n - 1), ], "seq_label" = i)
})

4 个答案:

答案 0 :(得分:1)

不确定你的数据集有多大,但如果数据量不大,可以使用循环:

# Stack every window of `n` consecutive rows of `df`.
#
# Arguments:
#   df  A data frame.
#   n   Window length, a single positive integer.
#
# Returns a data frame holding the rows of window 1, then window 2, and so
# on, plus an integer column `seq_label` with the window's number (equal to
# its starting row). The result has zero rows when `n` exceeds `nrow(df)`.
get_seq <- function(df, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1)

  n_windows <- nrow(df) - n + 1
  if (n_windows < 1) {
    # No complete window fits. A plain 1:n_windows would count down here
    # (e.g. 1:0) and index rows that do not exist.
    return(cbind(df[0, , drop = FALSE], seq_label = integer(0)))
  }

  # Build all windows first and bind them once, instead of growing the result
  # with rbind() on every iteration. drop = FALSE keeps single-column inputs
  # as data frames.
  windows <- lapply(seq_len(n_windows), function(i) {
    cbind(df[i:(i + n - 1), , drop = FALSE], seq_label = i)
  })
  do.call(rbind, windows)
}
get_seq(df,2)
a   b   seq_label
1   a   1       
43  b   1
43  b   2       
22  c   2       
22  c   3
12  d   3       
12  d   4
35  e   4       
35  e   5
113 f   5       
113 f   6
54  g   6       
54  g   7
94  h   7

get_seq(df,3)
a   b  seq_label
1   a  1
43  b  1            
22  c  1
43  b  2
22  c  2
12  d  2
22  c  3
12  d  3
35  e  3
12  d  4
35  e  4
113 f  4
35  e  5
113 f  5
54  g  5
113 f  6
54  g  6
94  h  6

答案 1 :(得分:1)

我们可以使用zoo包中的rollapply函数来创建行索引。

library(zoo)

# Repeat the rows of `df` as consecutive windows of length `n` and number the
# windows in a new `seq_label` column. Uses rollapply() from zoo.
get_sequenced_df <- function(df, n) {
  # One window of row numbers per matrix row; transposing before flattening
  # lists the row numbers window by window.
  window_rows <- rollapply(1:nrow(df), n, c)
  stacked <- df[c(t(window_rows)), ]
  # nrow(stacked) / n is the number of windows; each label is repeated n times.
  transform(stacked, seq_label = rep(seq(nrow(stacked) / n), each = n))
}

get_sequenced_df(df, 2)

#     a b seq_label
#1    1 a         1
#2   43 b         1
#3   43 b         2
#4   22 c         2
#5   22 c         3
#6   12 d         3
#7   12 d         4
#8   35 e         4
#9   35 e         5
#10 113 f         5
#11 113 f         6
#12  54 g         6
#13  54 g         7
#14  94 h         7

下面说明行索引是如何生成的:

# Window length 2: rollapply() collects each run of n consecutive row numbers;
# t() followed by c() flattens them window by window (see the printed result).
n <- 2
c(t(rollapply(1:nrow(df), n, c)))
#[1] 1 2 2 3 3 4 4 5 5 6 6 7 7 8

# Window length 3: same expression, now yielding triples of row numbers.
n <- 3
c(t(rollapply(1:nrow(df), n, c)))
#[1] 1 2 3 2 3 4 3 4 5 4 5 6 5 6 7 6 7 8


get_sequenced_df(df, 3)
#     a b seq_label
#1    1 a         1
#2   43 b         1
#3   22 c         1
#4   43 b         2
#5   22 c         2
#6   12 d         2
#7   22 c         3
#8   12 d         3
#9   35 e         3
#10  12 d         4
#11  35 e         4
#12 113 f         4
#13  35 e         5
#14 113 f         5
#15  54 g         5
#16 113 f         6
#17  54 g         6
#18  94 h         6

答案 2 :(得分:1)

可能的替代解决方案:

n <- 2

# There are nrow(df) - n + 1 complete windows and each has exactly n rows, so
# no truncated window is produced at the end (and none at all if n > nrow(df)).
# ix1: starting row of each window, repeated once per row of that window.
ix1 <- rep(seq_len(max(nrow(df) - n + 1, 0)), each = n)
# ix2: offset 0..(n - 1) of each row inside its window.
ix2 <- rep(seq_len(n) - 1L, times = length(ix1) / n)

# ix1 + ix2 is the source row of every output row; ix1 doubles as the label.
df2 <- df[ix1 + ix2,]
df2$seq_label <- ix1

给出:

> df2
     a b seq_label
1    1 a         1
2   43 b         1
3   43 b         2
4   22 c         2
5   22 c         3
6   12 d         3
7   12 d         4
8   35 e         4
9   35 e         5
10 113 f         5
11 113 f         6
12  54 g         6
13  54 g         7
14  94 h         7

使用n = 3,可以得到(注意:下面输出中最后一个标签为7的子序列只有2行,并不是一个完整的长度为3的子序列):

> df2
     a b seq_label
1    1 a         1
2   43 b         1
3   22 c         1
4   43 b         2
5   22 c         2
6   12 d         2
7   22 c         3
8   12 d         3
9   35 e         3
10  12 d         4
11  35 e         4
12 113 f         4
13  35 e         5
14 113 f         5
15  54 g         5
16 113 f         6
17  54 g         6
18  94 h         6
19  54 g         7
20  94 h         7

答案 3 :(得分:1)

我们可以使用outer创建索引:

n <- 2
# Start row of every complete window. seq_len() yields an empty vector when
# n > nrow(df), where 1:(nrow(df) - (n - 1)) would count down instead.
i <- seq_len(max(nrow(df) - n + 1, 0))

# outer() adds the offsets 0..(n - 1) to each start row, one window per matrix
# row; t() makes the flattened row index run window by window.
cbind(df[t(outer(i, 1:n - 1, `+`)), ],
      seq_label = rep(i, each = n))
#      a b seq_label
# 1    1 a         1
# 2   43 b         1
# 3   43 b         2
# 4   22 c         2
# 5   22 c         3
# 6   12 d         3
# 7   12 d         4
# 8   35 e         4
# 9   35 e         5
# 10 113 f         5
# 11 113 f         6
# 12  54 g         6
# 13  54 g         7
# 14  94 h         7

...或kronecker

# Same row index built with kronecker(): with FUN = `+`, every start row in i
# is combined with every offset 0..(n - 1). Uses the n and i defined for the
# outer() variant; seq_label repeats each start row n times.
cbind(df[kronecker(X = i, Y = 1:n - 1, FUN = `+`), ],
      seq_label = rep(i, each = n))

...或embed

# All row numbers of df; expects the window length n to be defined already.
i <- seq_len(nrow(df))
# embed(i, n) holds one window per row with the latest row number first, so
# the columns are reversed to ascending order. drop = FALSE keeps the matrix
# shape when n == 1 or when only one window exists.
# The labels are numbered 1..(nrow(df) - n + 1) explicitly: head(i, -(n - 1))
# would be empty for n == 1.
cbind(df[as.vector(t(embed(i, n)[, n:1, drop = FALSE])), ],
      seq_label = rep(seq_len(nrow(df) - n + 1), each = n))