使用dplyr以便根据另一个df更改列值

时间:2020-10-08 12:34:13

标签: r dataframe dplyr data.table

你好,我有一个 df1

scaf_name       coordinates value    
JZSA01000001.1  1   2
JZSA01000001.1  2   2
JZSA01000001.1  3   2
JZSA01000001.1  4   2
JZSA01000001.1  5   2
JZSA01000001.1  6   2
JZSA01000001.1  7   2
JZSA01000001.1  8   2
JZSA01000001.1  9   2
JZSA01000001.1  10  2
JZSA01000001.1  11  5
JZSA01000001.1  12  5
JZSA01000001.1  13  5
JZSA01000001.1  14  5
JZSA01000001.1  15  5
JZSA01000001.1  16  5
JZSA01000001.1  17  5
JZSA01000001.1  18  6
JZSA01000002.1  1   2
JZSA01000002.1  2   2
JZSA01000002.1  3   2
JZSA01000002.1  4   2
JZSA01000002.1  5   2
JZSA01000002.1  6   2
JZSA01000003.1  1   5
JZSA01000003.1  2   5
JZSA01000003.1  3   6
JZSA01000003.1  4   6
JZSA01000003.1  5   6
JZSA01000003.1  6   6
JZSA01000003.1  7   6
JZSA01000003.1  8   6
JZSA01000003.1  9   6

和另一个 df_interval

scaffold          start     end
JZSA01000001.1_0  1         14
JZSA01000001.1_1  15        18
JZSA01000002.1    1         12
JZSA01000003.1_0  1         3
JZSA01000003.1_1  4         6
JZSA01000003.1_2  7         9

我想根据 df_interval$start 和 df_interval$end 来更改 df1$scaf_name。

例如

对于每个被 df_interval$scaffold 包含的 df1$scaf_name(即 scaffold 以该 scaf_name 开头),如果 df1$coordinates 位于 1–14 之间,则该行将被重命名为 JZSA01000001.1_0

在这里我应该得到输出

scaf_name       coordinates value    
JZSA01000001.1_0    1   2
JZSA01000001.1_0    2   2
JZSA01000001.1_0    3   2
JZSA01000001.1_0    4   2
JZSA01000001.1_0    5   2
JZSA01000001.1_0    6   2
JZSA01000001.1_0    7   2
JZSA01000001.1_0    8   2
JZSA01000001.1_0    9   2
JZSA01000001.1_0    10  2
JZSA01000001.1_0    11  5
JZSA01000001.1_0    12  5
JZSA01000001.1_0    13  5
JZSA01000001.1_0    14  5
JZSA01000001.1_1    15  5
JZSA01000001.1_1    16  5
JZSA01000001.1_1    17  5
JZSA01000001.1_1    18  6
JZSA01000002.1      1   2
JZSA01000002.1      2   2
JZSA01000002.1      3   2
JZSA01000002.1      4   2
JZSA01000002.1      5   2
JZSA01000002.1      6   2
JZSA01000003.1_0    1   5
JZSA01000003.1_0    2   5
JZSA01000003.1_0    3   6
JZSA01000003.1_1    4   6
JZSA01000003.1_1    5   6
JZSA01000003.1_1    6   6
JZSA01000003.1_2    7   6
JZSA01000003.1_2    8   6
JZSA01000003.1_2    9   6

df1文件非常大,如果有人想尽快实现,那将是惊人的。 谢谢

数据

df1

structure(list(scaf_name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("JZSA01000001.1", 
"JZSA01000002.1", "JZSA01000003.1"), class = "factor"), coor = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 
16L, 17L, 18L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L), dinates.value = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 2L, 2L, 2L, 2L, 2L, 2L, 
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L)), class = "data.frame", row.names = c(NA, 
-33L))

df_interval

structure(list(scaffold = structure(1:6, .Label = c("JZSA01000001.1_0", 
"JZSA01000001.1_1", "JZSA01000002.1", "JZSA01000003.1_0", "JZSA01000003.1_1", 
"JZSA01000003.1_2"), class = "factor"), start = c(1L, 15L, 1L, 
1L, 4L, 7L), end = c(14L, 18L, 12L, 3L, 6L, 9L)), class = "data.frame", row.names = c(NA, 
-6L))

我有这个解决方案:

library(data.table)
# Update join: rows of df1 whose coordinate falls inside [start, end] of a
# df_interval row get their scaf_name overwritten with that row's scaffold.
# NOTE(review): the join condition contains no scaffold key, so an interval
# belonging to one scaffold also matches rows of every other scaffold with the
# same coordinates -- presumably why some scaf_name values vanish from the
# result.
# NOTE(review): the dput() of df1 further down names this column `coor`, not
# `coordinates` -- confirm which name the real data uses.
setDT(df1)[df_interval, scaf_name := scaffold,
         on =  .(coordinates >= start, coordinates <= end)]

但是这样做之后,某些 scaf_name 从输出中消失了……


为Ronak编辑

这是我使用代码的等效项( df_interval ,在此后称为 interval_tab )的头

# Derive the key shared with `tab`: drop the "_<suffix>" that follows a digit,
# e.g. "KQ759767.1_0" -> "KQ759767.1"; names without a suffix are unchanged.
# The lookbehind keeps underscores that are not preceded by a digit.
setDT(interval_tab)[, scaf_name := sub(
  "(?<=[0-9])_.*", "", scaffold,
  perl = TRUE
)]

> head(interval_tab)
       scaffold start   end  scaf_name
1:   KQ759765.1     1  1417 KQ759765.1
2:   KQ759766.1     1  1389 KQ759766.1
3: KQ759767.1_0     1 23930 KQ759767.1
4: KQ759767.1_1 23931 83220 KQ759767.1
5: KQ759767.1_2 83221 92117 KQ759767.1
6: KQ759767.1_3 92118 92679 KQ759767.1

这是等效的 df1 (在这里称为 tab )的头

> head(tab)
           V1 V2 V3
1: KQ759765.1  1  0
2: KQ759765.1  2  0
3: KQ759765.1  3  0
4: KQ759765.1  4  0
5: KQ759765.1  5  0
6: KQ759765.1  6  0

然后我使用了您的代码:

# NOTE(review): `tab` as printed above only has the columns V1, V2 and V3, so
# the `scaf_name` key named in `on` does not exist in `tab`; that is what the
# error message below reports.
> setDT(tab)[interval_tab, scaf_name := scaffold,on =  .(scaf_name, V2 >= start, V2 <= end)]

并收到错误消息

Error in colnamesInt(x, names(on), check_dups = FALSE) : 
  argument specifying columns specify non existing column(s): cols[1]='scaf_name'

2 个答案:

答案 0 :(得分:2)

您需要创建一个 df_interval 和 df1 之间共用的键。为此,您可以新建一列,把 scaffold 列中下划线及其后的所有内容去掉,然后进行联接。

library(data.table)

# Key shared by both tables: the scaffold label with everything from the
# first underscore onwards removed.
setDT(df_interval)
df_interval[, scaf_name := sub("_.*", "", scaffold)]

# Non-equi update join, by reference: each df1 row whose scaf_name matches and
# whose coor lies within [start, end] takes the interval's scaffold label.
setDT(df1)
df1[
  df_interval,
  scaf_name := scaffold,
  on = .(scaf_name, coor >= start, coor <= end)
]

这将返回:

#           scaf_name coor dinates.value
# 1: JZSA01000001.1_0    1             2
# 2: JZSA01000001.1_0    2             2
# 3: JZSA01000001.1_0    3             2
# 4: JZSA01000001.1_0    4             2
# 5: JZSA01000001.1_0    5             2
# 6: JZSA01000001.1_0    6             2
# 7: JZSA01000001.1_0    7             2
# 8: JZSA01000001.1_0    8             2
# 9: JZSA01000001.1_0    9             2
#10: JZSA01000001.1_0   10             2
#11: JZSA01000001.1_0   11             5
#12: JZSA01000001.1_0   12             5
#13: JZSA01000001.1_0   13             5
#14: JZSA01000001.1_0   14             5
#15: JZSA01000001.1_1   15             5
#16: JZSA01000001.1_1   16             5
#17: JZSA01000001.1_1   17             5
#18: JZSA01000001.1_1   18             6
#19:   JZSA01000002.1    1             2
#20:   JZSA01000002.1    2             2
#21:   JZSA01000002.1    3             2
#22:   JZSA01000002.1    4             2
#23:   JZSA01000002.1    5             2
#24:   JZSA01000002.1    6             2
#25: JZSA01000003.1_0    1             5
#26: JZSA01000003.1_0    2             5
#27: JZSA01000003.1_0    3             6
#28: JZSA01000003.1_1    4             6
#29: JZSA01000003.1_1    5             6
#30: JZSA01000003.1_1    6             6
#31: JZSA01000003.1_2    7             6
#32: JZSA01000003.1_2    8             6
#33: JZSA01000003.1_2    9             6
#           scaf_name coor dinates.value

答案 1 :(得分:0)

这可能行得通,您只需要删除旧列即可:

library(dplyr)
# Sample input from the question, rebuilt column by column.
# df1: one row per (scaffold, coordinate) pair together with its value.
df1 <- data.frame(
  scaf_name = factor(
    rep(1:3, times = c(18L, 6L, 9L)),
    labels = c("JZSA01000001.1", "JZSA01000002.1", "JZSA01000003.1")
  ),
  coor = c(1:18, 1:6, 1:9),
  dinates.value = rep(
    c(2L, 5L, 6L, 2L, 5L, 6L),
    times = c(10L, 7L, 1L, 6L, 2L, 7L)
  )
)

# df_interval: the coordinate range [start, end] assigned to each scaffold
# label.
df_interval <- data.frame(
  scaffold = factor(
    1:6,
    labels = c(
      "JZSA01000001.1_0", "JZSA01000001.1_1", "JZSA01000002.1",
      "JZSA01000003.1_0", "JZSA01000003.1_1", "JZSA01000003.1_2"
    )
  ),
  start = c(1L, 15L, 1L, 1L, 4L, 7L),
  end = c(14L, 18L, 12L, 3L, 6L, 9L)
)
# First attempt (superseded by the edits further down in this answer).
# Joins df1 to df_interval on scaf_name == scaffold, then writes one hard-coded
# label into `newcol` for rows with coor in 1..14; case_when() leaves every
# other row as NA. Only newcol, coor and dinates.value are kept.
# NOTE(review): the label literal ends with a space ("JZSA01000001.1_0 ").
# NOTE(review): in the sample data only "JZSA01000002.1" occurs both as a
# scaf_name and as a scaffold, so the inner join presumably keeps just those
# rows.
df_t = df1 %>%
  inner_join(df_interval, by = c("scaf_name" = "scaffold")) %>%
  mutate(newcol = case_when(coor>= 1 & coor <= 14 ~ "JZSA01000001.1_0 ")) %>%
  select(c("newcol", "coor", "dinates.value"))

编辑 现在,旧的值被替换了。我从内部加入改为了左加入

# Second attempt: the left join keeps every df1 row, and `end` is NA wherever
# no scaffold label equals scaf_name exactly.
# scaf_name is overwritten in place, but still only for one hard-coded range
# (coor in 1..14) and with one hard-coded label.
# NOTE(review): the join key is still the full scaffold label, so suffixed
# labels such as "JZSA01000001.1_0" never match a df1 row.
df_t = df1 %>%
  left_join(df_interval, by = c("scaf_name" = "scaffold")) %>%
  mutate(scaf_name = replace(scaf_name, coor >= 1 &
                               coor <= 14 &
                               !is.na(end), "JZSA01000001.1_0")) %>% 
  select(c("scaf_name", "coor", "dinates.value"))

编辑2

好吧,我想我明白了。假设可以使用下划线,这很有效,并且看起来像您的解决方案:

# Join key: the scaffold label without its "_<n>" suffix. sub() handles labels
# with and without a suffix directly, instead of relying on rbind() recycling
# the single piece that strsplit() returns for unsuffixed labels.
df_interval$shorti <- sub("_.*", "", as.character(df_interval$scaffold))

# Relabel every df1 row with the scaffold whose [start, end] contains its coor.
# NOTE: each df1 row is first paired with every interval of its scaffold, so
# the intermediate table can be much larger than df1.
# NOTE: rows whose coor falls in no interval are dropped by filter().
df_t <- df1 %>%
  # Join on character keys so both sides have the same type.
  mutate(scaf_name = as.character(scaf_name)) %>%
  left_join(df_interval, by = c("scaf_name" = "shorti")) %>%
  filter(coor >= start & coor <= end) %>%
  # After the filter every remaining row matches, so no ifelse() is needed.
  mutate(scaf_name = as.character(scaffold)) %>%
  select(scaf_name, coor, dinates.value)