我对在满足某些条件之前将行分组在一起有疑问。这是我的数据框。
| Gene | directon |intergenic_distance|
| -------- | -------------- |-------------------|
| fixA | 11 |NA |
| fixB | 11 |15 |
| fixC | 11 |51 |
| fixX | 11 |-3 |
| kefF | 11 |108 |
| kefC | 11 |-7 |
| apaH | 12 |NA |
| apaG | 12 |7 |
我想在同一个 directon 内对行进行分组:每当 intergenic_distance > 50 时就开始一个新的分组,如下所示。
| Gene | directon |intergenic_distance|operon|
| -------- | -------------- |-------------------|------|
| fixA | 11 |NA |1 |
| fixB | 11 |15 |1 |
| fixC | 11 |51 |2 |
| fixX | 11 |-3 |2 |
| kefF | 11 |108 |3 |
| kefC | 11 |-7 |3 |
| apaH | 12 |NA |4 |
| apaG | 12 |7 |4 |
我正在考虑使用 with、rle、rep、seq_along,但我不知道如何使用。提前致谢!
# Reproducible sample of the real data: the first 10 rows of e_coli_operon.
# NOTE: the dput() output below has class "grouped_df" with groups defined on
# `directon`, and `directon` is stored as character here ("1", "4", ...), so
# dplyr verbs applied to it operate per directon unless it is ungrouped first.
dput(head(e_coli_operon,10))
# Output of the call above:
structure(list(name = c("thrA", "thrB", "thrC", "yaaW", "yaaI",
"mokC", "hokC", "insB", "insA", "yaaY"), start = c(337, 2801,
3734, 10643, 11382, 16751, 16751, 19811, 20233, 21181), end = c(2799,
3733, 5020, 11356, 11786, 16960, 16903, 20314, 20508, 21399),
strand = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1), length = c(820L,
310L, 428L, 237L, 134L, 69L, 50L, 167L, 91L, 72L), pid = c(16127996L,
16127997L, 16127998L, 16128005L, 16128007L, 16128012L, 49175991L,
16128015L, 16128016L, 16128018L), gene = c("thrA", "thrB",
"thrC", "yaaW", "yaaI", "mokC", "hokC", "insB", "insA", "yaaY"
), synonym = c("b0002", "b0003", "b0004", "b0011", "b0013",
"b0018", "b4412", "b0021", "b0022", "b0024"), code = c("-",
"-", "-", "-", "-", "-", "-", "-", "-", "-"), cog = c("COG0527E",
"COG0083E", "COG0498E", "COG4735S", "-", "-", "-", "COG1662L",
"COG3677L", "-"), product = c("fused aspartokinase I and homoserine dehydrogenase I",
"homoserine kinase", "threonine synthase", "conserved protein, UPF0174 family",
"conserved protein, UPF0412 family", "regulatory protein for HokC, overlaps CDS of hokC",
"toxic membrane protein, small", "IS1 transposase B", "IS1 repressor TnpA",
"predicted protein"), col = c("blue", "blue", "blue", "blue",
"blue", "blue", "blue", "blue", "blue", "blue"), fill = c("blue",
"blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue",
"blue"), lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lwd = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), pch = c(8, 8, 8, 8, 8, 8, 8,
8, 8, 8), cex = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gene_type = c("arrows",
"arrows", "arrows", "arrows", "arrows", "arrows", "arrows",
"arrows", "arrows", "arrows"), directon = c("1", "1", "1",
"4", "4", "6", "6", "8", "8", "9"), intergenic_distance = c(82,
2, 1, 149, 26, NA, -209, NA, -81, NA)), row.names = c(NA,
-10L), groups = structure(list(directon = c("1", "4", "6", "8",
"9"), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
答案 0 :(得分:0)
当满足以下任一条件时,将 operon 值加 1:intergenic_distance 大于 50 且不是 NA,或者 directon 值与上一行的 directon 值不同。
library(dplyr)
# Number operons with a single running counter over the whole table. A new
# operon starts at a row when the gap to the previous gene exceeds 50, or when
# its directon differs from the previous row's directon.
df %>%
  # Drop any existing grouping first: on a grouped data frame mutate() is
  # evaluated within each group, so the counter would restart in every
  # directon and the directon comparison below could never be TRUE.
  # ungroup() leaves a plain data.frame unchanged.
  ungroup() %>%
  mutate(
    operon = cumsum(
      # The first row has no predecessor, so it can never open a new operon;
      # this keeps the numbering starting at 1 even if its distance is > 50.
      row_number() > 1 &
        (
          # An NA distance never starts an operon on its own.
          (!is.na(intergenic_distance) & intergenic_distance > 50) |
            directon != lag(directon, default = first(directon))
        )
    ) + 1
  )
# Gene directon intergenic_distance operon
#1 fixA 11 NA 1
#2 fixB 11 15 1
#3 fixC 11 51 2
#4 fixX 11 -3 2
#5 kefF 11 108 3
#6 kefC 11 -7 3
#7 apaH 12 NA 4
#8 apaG 12 7 4
数据
# Sample data matching the table in the question: eight genes, six in
# directon 11 followed by two in directon 12; the first gene listed for each
# directon has an NA intergenic distance.
df <- data.frame(
  Gene = c("fixA", "fixB", "fixC", "fixX", "kefF", "kefC", "apaH", "apaG"),
  directon = rep(c(11L, 12L), times = c(6L, 2L)),
  intergenic_distance = c(NA, 15L, 51L, -3L, 108L, -7L, NA, 7L),
  stringsAsFactors = FALSE
)