你好,我有一个 df1 :
scaf_name coordinates value
JZSA01000001.1 1 2
JZSA01000001.1 2 2
JZSA01000001.1 3 2
JZSA01000001.1 4 2
JZSA01000001.1 5 2
JZSA01000001.1 6 2
JZSA01000001.1 7 2
JZSA01000001.1 8 2
JZSA01000001.1 9 2
JZSA01000001.1 10 2
JZSA01000001.1 11 5
JZSA01000001.1 12 5
JZSA01000001.1 13 5
JZSA01000001.1 14 5
JZSA01000001.1 15 5
JZSA01000001.1 16 5
JZSA01000001.1 17 5
JZSA01000001.1 18 6
JZSA01000002.1 1 2
JZSA01000002.1 2 2
JZSA01000002.1 3 2
JZSA01000002.1 4 2
JZSA01000002.1 5 2
JZSA01000002.1 6 2
JZSA01000003.1 1 5
JZSA01000003.1 2 5
JZSA01000003.1 3 6
JZSA01000003.1 4 6
JZSA01000003.1 5 6
JZSA01000003.1 6 6
JZSA01000003.1 7 6
JZSA01000003.1 8 6
JZSA01000003.1 9 6
和另一个 df_interval
scaffold start end
JZSA01000001.1_0 1 14
JZSA01000001.1_1 15 18
JZSA01000002.1 1 12
JZSA01000003.1_0 1 3
JZSA01000003.1_1 4 6
JZSA01000003.1_2 7 9
我想根据 df_interval$scaffold、df_interval$start 和 df_interval$end 来重命名 df1$scaf_name。
例如,对于每个出现在 df_interval$scaffold 中的 df1$scaf_name,如果 df1$coordinates 落在对应区间的 start 和 end 之间(例如 1-14),那么这些行的 scaf_name 就改为该区间的名称 JZSA01000001.1_0。
在这里我应该得到输出
scaf_name coordinates value
JZSA01000001.1_0 1 2
JZSA01000001.1_0 2 2
JZSA01000001.1_0 3 2
JZSA01000001.1_0 4 2
JZSA01000001.1_0 5 2
JZSA01000001.1_0 6 2
JZSA01000001.1_0 7 2
JZSA01000001.1_0 8 2
JZSA01000001.1_0 9 2
JZSA01000001.1_0 10 2
JZSA01000001.1_0 11 5
JZSA01000001.1_0 12 5
JZSA01000001.1_0 13 5
JZSA01000001.1_0 14 5
JZSA01000001.1_1 15 5
JZSA01000001.1_1 16 5
JZSA01000001.1_1 17 5
JZSA01000001.1_1 18 6
JZSA01000002.1 1 2
JZSA01000002.1 2 2
JZSA01000002.1 3 2
JZSA01000002.1 4 2
JZSA01000002.1 5 2
JZSA01000002.1 6 2
JZSA01000003.1_0 1 5
JZSA01000003.1_0 2 5
JZSA01000003.1_0 3 6
JZSA01000003.1_1 4 6
JZSA01000003.1_1 5 6
JZSA01000003.1_1 6 6
JZSA01000003.1_2 7 6
JZSA01000003.1_2 8 6
JZSA01000003.1_2 9 6
df1 文件非常大,如果有人知道一种运行速度快的实现方法,那就太好了。谢谢。
数据
df1
structure(list(scaf_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("JZSA01000001.1",
"JZSA01000002.1", "JZSA01000003.1"), class = "factor"), coor = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L), dinates.value = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 2L, 2L, 2L, 2L, 2L, 2L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L)), class = "data.frame", row.names = c(NA,
-33L))
df_interval
structure(list(scaffold = structure(1:6, .Label = c("JZSA01000001.1_0",
"JZSA01000001.1_1", "JZSA01000002.1", "JZSA01000003.1_0", "JZSA01000003.1_1",
"JZSA01000003.1_2"), class = "factor"), start = c(1L, 15L, 1L,
1L, 4L, 7L), end = c(14L, 18L, 12L, 3L, 6L, 9L)), class = "data.frame", row.names = c(NA,
-6L))
我有这个解决方案:
library(data.table)
# Asker's original attempt: update join that overwrites `scaf_name` with the
# `scaffold` label of the interval whose [start, end] range contains the row's
# coordinate.
# NOTE(review): the `on` clause matches on the coordinate range only and has no
# scaffold key, so an interval belonging to one scaffold can match rows of
# another scaffold.
# NOTE(review): the dput() sample data names this column `coor`, not
# `coordinates` -- confirm which name the real table uses.
setDT(df1)[df_interval, scaf_name := scaffold,
on = .(coordinates >= start, coordinates <= end)]
但是对于某些 scaf_name,
它们的行从输出中被删除了……
给 Ronak 的补充编辑
这是我实际使用的、与 df_interval 对应的表(以下称为 interval_tab)的前几行:
# Derive the parent scaffold name (the join key) by removing the "_<suffix>"
# part of `scaffold`. The lookbehind only cuts at an underscore that directly
# follows a digit. Plain ASCII quotes and `.*` are required for this to parse.
setDT(interval_tab)[, scaf_name := sub("(?<=[0-9])_.*", "", scaffold, perl = TRUE)]
> head(interval_tab)
scaffold start end scaf_name
1: KQ759765.1 1 1417 KQ759765.1
2: KQ759766.1 1 1389 KQ759766.1
3: KQ759767.1_0 1 23930 KQ759767.1
4: KQ759767.1_1 23931 83220 KQ759767.1
5: KQ759767.1_2 83221 92117 KQ759767.1
6: KQ759767.1_3 92118 92679 KQ759767.1
这是与 df1 对应的表(这里称为 tab)的前几行:
> head(tab)
V1 V2 V3
1: KQ759765.1 1 0
2: KQ759765.1 2 0
3: KQ759765.1 3 0
4: KQ759765.1 4 0
5: KQ759765.1 5 0
6: KQ759765.1 6 0
然后我使用了您的代码:
> setDT(tab)[interval_tab, scaf_name := scaffold,on = .(scaf_name, V2 >= start, V2 <= end)]
并收到错误消息
Error in colnamesInt(x, names(on), check_dups = FALSE) :
argument specifying columns specify non existing column(s): cols[1]='scaf_name'
答案 0 :(得分:2)
您需要在 df_interval 和 df1 之间创建一个公共的键(key)。
为此,可以新建一列,去掉 scaffold 列中下划线及其后面的后缀,然后再进行联接。
library(data.table)
# Build the join key shared with df1: the scaffold label without its trailing
# "_<digits>" chunk suffix. Anchoring the pattern at the end of the string keeps
# underscores that belong to the scaffold ID itself
# (e.g. "NW_0123.1_0" -> "NW_0123.1"), which '_.*' would cut off.
setDT(df_interval)[, scaf_name := sub("_[0-9]+$", "", scaffold)]
# Non-equi update join: for every df1 row whose scaffold matches and whose
# `coor` lies within [start, end], overwrite `scaf_name` with the interval's
# `scaffold` label. Rows without a matching interval are left unchanged.
setDT(df1)[df_interval, scaf_name := scaffold,
  on = .(scaf_name, coor >= start, coor <= end)]
这将返回:
# scaf_name coor dinates.value
# 1: JZSA01000001.1_0 1 2
# 2: JZSA01000001.1_0 2 2
# 3: JZSA01000001.1_0 3 2
# 4: JZSA01000001.1_0 4 2
# 5: JZSA01000001.1_0 5 2
# 6: JZSA01000001.1_0 6 2
# 7: JZSA01000001.1_0 7 2
# 8: JZSA01000001.1_0 8 2
# 9: JZSA01000001.1_0 9 2
#10: JZSA01000001.1_0 10 2
#11: JZSA01000001.1_0 11 5
#12: JZSA01000001.1_0 12 5
#13: JZSA01000001.1_0 13 5
#14: JZSA01000001.1_0 14 5
#15: JZSA01000001.1_1 15 5
#16: JZSA01000001.1_1 16 5
#17: JZSA01000001.1_1 17 5
#18: JZSA01000001.1_1 18 6
#19: JZSA01000002.1 1 2
#20: JZSA01000002.1 2 2
#21: JZSA01000002.1 3 2
#22: JZSA01000002.1 4 2
#23: JZSA01000002.1 5 2
#24: JZSA01000002.1 6 2
#25: JZSA01000003.1_0 1 5
#26: JZSA01000003.1_0 2 5
#27: JZSA01000003.1_0 3 6
#28: JZSA01000003.1_1 4 6
#29: JZSA01000003.1_1 5 6
#30: JZSA01000003.1_1 6 6
#31: JZSA01000003.1_2 7 6
#32: JZSA01000003.1_2 8 6
#33: JZSA01000003.1_2 9 6
# scaf_name coor dinates.value
答案 1 :(得分:0)
这可能行得通,您只需要删除旧列即可:
library(dplyr)
# Sample input from the question: one row per (scaffold, coordinate) position.
# `scaf_name` is a factor with three scaffold levels; the other two columns are
# integer vectors.
df1 <- data.frame(
  scaf_name = factor(
    rep(1:3, times = c(18L, 6L, 9L)),
    labels = c("JZSA01000001.1", "JZSA01000002.1", "JZSA01000003.1")
  ),
  coor = c(1:18, 1:6, 1:9),
  dinates.value = rep(
    c(2L, 5L, 6L, 2L, 5L, 6L),
    times = c(10L, 7L, 1L, 6L, 2L, 7L)
  )
)
# Interval table: one row per chunk label with its inclusive coordinate range.
df_interval <- data.frame(
  scaffold = factor(
    1:6,
    labels = c(
      "JZSA01000001.1_0", "JZSA01000001.1_1", "JZSA01000002.1",
      "JZSA01000003.1_0", "JZSA01000003.1_1", "JZSA01000003.1_2"
    )
  ),
  start = c(1L, 15L, 1L, 1L, 4L, 7L),
  end = c(14L, 18L, 12L, 3L, 6L, 9L)
)
# First attempt (superseded by the later edits in this answer).
# NOTE(review): the join key is the full `scaffold` label, so the inner join
# keeps only df1 rows whose name equals a `scaffold` value exactly (in the
# sample data that is just "JZSA01000002.1").
# NOTE(review): the 1-14 range and the new label are hard-coded, case_when()
# has no fallback branch (other rows get NA), and the label literal ends with
# a trailing space.
df_t = df1 %>%
inner_join(df_interval, by = c("scaf_name" = "scaffold")) %>%
mutate(newcol = case_when(coor>= 1 & coor <= 14 ~ "JZSA01000001.1_0 ")) %>%
select(c("newcol", "coor", "dinates.value"))
编辑:现在旧的值会被直接替换。我把内连接(inner join)改成了左连接(left join)。
# Second attempt (superseded by the final edit in this answer): a left join
# keeps df1 rows that have no exact `scaffold` match, and replace() overwrites
# `scaf_name` in place instead of adding a new column.
# NOTE(review): the 1-14 range and the "JZSA01000001.1_0" label are still
# hard-coded; `!is.na(end)` limits the rename to rows that found a join
# partner, i.e. rows whose name equals a `scaffold` label exactly.
df_t = df1 %>%
left_join(df_interval, by = c("scaf_name" = "scaffold")) %>%
mutate(scaf_name = replace(scaf_name, coor >= 1 &
coor <= 14 &
!is.na(end), "JZSA01000001.1_0")) %>%
select(c("scaf_name", "coor", "dinates.value"))
编辑2
好的,我想我明白了。假设可以按下划线来拆分名称,下面的代码是可行的,并且结果看起来与您的解决方案类似:
# Join key: the scaffold label with a trailing "_<digits>" chunk suffix removed.
# Anchoring at the end of the string keeps underscores that are part of the
# scaffold ID itself, which splitting on every "_" would cut off.
df_interval$shorti <- sub("_[0-9]+$", "", as.character(df_interval$scaffold))
# Pair each df1 row with every interval of its scaffold, keep only the pairing
# whose [start, end] range contains the coordinate, then use that interval's
# label as the new name. After the filter every remaining row is inside its
# interval, so the label can be assigned directly.
# NOTE: filter() drops rows whose condition is FALSE or NA, so df1 rows that
# fall outside every interval (or whose scaffold has no interval entry) are
# removed from the result.
df_t <- df1 %>%
  left_join(df_interval, by = c("scaf_name" = "shorti")) %>%
  filter(coor >= start & coor <= end) %>%
  mutate(scaf_name = as.character(scaffold)) %>%
  select(c("scaf_name", "coor", "dinates.value"))