你好,我有一个 df1 :
scaf_name coordinates value
JZSA01000001.1 1 2
JZSA01000001.1 2 2
JZSA01000001.1 3 2
JZSA01000001.1 4 2
JZSA01000001.1 5 2
JZSA01000001.1 6 2
JZSA01000001.1 7 2
JZSA01000001.1 8 2
JZSA01000001.1 9 2
JZSA01000001.1 10 2
JZSA01000001.1 11 5
JZSA01000001.1 12 5
JZSA01000001.1 13 5
JZSA01000001.1 14 5
JZSA01000001.1 15 5
JZSA01000001.1 16 5
JZSA01000001.1 17 5
JZSA01000001.1 18 6
JZSA01000002.1 1 2
JZSA01000002.1 2 2
JZSA01000002.1 3 2
JZSA01000002.1 4 2
JZSA01000002.1 5 2
JZSA01000002.1 6 2
JZSA01000003.1 1 5
JZSA01000003.1 2 5
JZSA01000003.1 3 6
JZSA01000003.1 4 6
JZSA01000003.1 5 6
JZSA01000003.1 6 6
JZSA01000003.1 7 6
JZSA01000003.1 8 6
JZSA01000003.1 9 6
和另一个 df_interval
scaffold start end
JZSA01000001.1_0 1 14
JZSA01000001.1_1 15 18
JZSA01000002.1 1 12
JZSA01000003.1_0 1 3
JZSA01000003.1_1 4 6
JZSA01000003.1_2 7 9
我想根据 df_interval$start 和 df_interval$end 重命名 df1$scaf_name。
例如:
对于 df1$scaf_name 为 JZSA01000001.1(即 df_interval$scaffold 去掉 "_数字" 后缀后的名称)、
并且 df1$coordinates 在 1-14 之间的所有行,
scaf_name 将被重命名为 JZSA01000001.1_0。
在这里我应该得到输出
scaf_name coordinates value
JZSA01000001.1_0 1 2
JZSA01000001.1_0 2 2
JZSA01000001.1_0 3 2
JZSA01000001.1_0 4 2
JZSA01000001.1_0 5 2
JZSA01000001.1_0 6 2
JZSA01000001.1_0 7 2
JZSA01000001.1_0 8 2
JZSA01000001.1_0 9 2
JZSA01000001.1_0 10 2
JZSA01000001.1_0 11 5
JZSA01000001.1_0 12 5
JZSA01000001.1_0 13 5
JZSA01000001.1_0 14 5
JZSA01000001.1_1 15 5
JZSA01000001.1_1 16 5
JZSA01000001.1_1 17 5
JZSA01000001.1_1 18 6
JZSA01000002.1 1 2
JZSA01000002.1 2 2
JZSA01000002.1 3 2
JZSA01000002.1 4 2
JZSA01000002.1 5 2
JZSA01000002.1 6 2
JZSA01000003.1_0 1 5
JZSA01000003.1_0 2 5
JZSA01000003.1_0 3 6
JZSA01000003.1_1 4 6
JZSA01000003.1_1 5 6
JZSA01000003.1_1 6 6
JZSA01000003.1_2 7 6
JZSA01000003.1_2 8 6
JZSA01000003.1_2 9 6
df1 文件非常大,如果有人能给出一种运行速度快的实现方法,那就太好了。谢谢!
数据
# Example input: one row per (scaffold, coordinate) pair with its value.
# Column names match the table shown in the question
# (scaf_name, coordinates, value); scaf_name is kept as a factor.
df1 <- structure(list(scaf_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("JZSA01000001.1",
"JZSA01000002.1", "JZSA01000003.1"), class = "factor"), coordinates = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L), value = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 2L, 2L, 2L, 2L, 2L, 2L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L)), class = "data.frame", row.names = c(NA,
-33L))
# Example lookup table: each row names a sub-scaffold and the inclusive
# coordinate range [start, end] it covers. Names carry an optional "_<n>"
# suffix appended to the parent scaffold name used in df1$scaf_name.
df_interval <- structure(list(scaffold = structure(1:6, .Label = c("JZSA01000001.1_0",
"JZSA01000001.1_1", "JZSA01000002.1", "JZSA01000003.1_0", "JZSA01000003.1_1",
"JZSA01000003.1_2"), class = "factor"), start = c(1L, 15L, 1L,
1L, 4L, 7L), end = c(14L, 18L, 12L, 3L, 6L, 9L)), class = "data.frame", row.names = c(NA,
-6L))
答案 0 :(得分:2)
我们可以使用 data.table 的非等值连接(non-equi join)
library(data.table)

# Rename df1$scaf_name to the sub-scaffold whose interval contains the row's
# coordinate. df1 is converted and updated by reference (no copy of the large
# table); df_interval itself is left untouched.
setDT(df1)

# Small lookup copy with character names, so new labels are not constrained
# by the factor levels of either input.
intervals <- data.table(
  scaffold = as.character(df_interval$scaffold),
  start = df_interval$start,
  end = df_interval$end
)

# Sub-scaffold names are "<parent>" or "<parent>_<n>"; strip the optional
# numeric suffix to recover the parent name used in df1$scaf_name.
intervals[, parent := sub("_[0-9]+$", "", scaffold)]

df1[, scaf_name := as.character(scaf_name)]

# Non-equi update join. The parent-name equality is essential: joining on the
# coordinate range alone would also match intervals of other scaffolds
# (e.g. coordinate 1 lies in 1-14, 1-12 and 1-3) and the last match would win.
# Rows of df1 with no matching interval keep their original name.
df1[intervals, scaf_name := i.scaffold,
    on = .(scaf_name == parent, coordinates >= start, coordinates <= end)]