比较两个数据帧,如何根据条件和范围将ID从一个df添加到第二个

时间:2019-10-21 11:33:57

标签: r dataframe

我不得不再次提出一个与我最近在此处(here)问过的问题类似的问题,因为我仍然没有得到答案;现在我有了一个新的数据集,而以前的解决方案很遗憾都不起作用……

我有两个数据集:

# Example table to be annotated: the first (unnamed) field of each record
# becomes the row name; the remaining columns are CHROM, POS and QUAL.
Input <- "     CHROM      POS    QUAL
1    chr10 10057508  223.60
2    chr10 10057509  223.60
3    chr10 10057514  223.60
300  chr17 18183700 1847.03
301  chr17 18377233  532.06
302  chr17 27975024   93.60
303  chr17 27975027  157.64
8000 chr12  6923083  217.60
8001 chr12 70920775  157.60
8002 chr12 70920776  157.60
8003 chr12 70920777  157.60"
# `text =` makes read.table() create and close its own connection. An
# already-open textConnection() passed as `file` is never closed by
# read.table() and lingers until garbage collection warns about it.
# read.table() already returns a data.frame, so no as.data.frame() is needed.
df1 <- read.table(text = Input, header = TRUE, row.names = 1)

# Example interval table: the first (unnamed) field of each record becomes
# the row name; the remaining columns are chr, start, stop and merged.
Input <- "         chr     start      stop                  merged
1       chr1      9868     11868          chr1_9868_11868
852     chr1   2814998   2816998     chr1_2814998_2816998
117618 chr10  10056721  10058721  chr10_10056721_10058721
185773 chr17  18181827  18183827  chr17_18181827_18183827
185853 chr17  18375777  18377777  chr17_18375777_18377777
186710 chr17  27974223  27976223  chr17_27974223_27976223
139286 chr12   6922462   6924462    chr12_6922462_6924462
145955 chr12  70920738  70922738  chr12_70920738_70922738
66558   chr5 132871443 132873443 chr5_132871443_132873443
78653   chr6 125818858 125820858 chr6_125818858_125820858"
# `text =` makes read.table() create and close its own connection. An
# already-open textConnection() passed as `file` is never closed by
# read.table() and lingers until garbage collection warns about it.
# read.table() already returns a data.frame, so no as.data.frame() is needed.
df2 <- read.table(text = Input, header = TRUE, row.names = 1)

我想把df2中的定位ID(即merged列)添加到df1的相应行中。CHROM与chr必须相等,并且POS必须落在df2的start与stop之间。因此期望的输出应如下所示(这是我手动整理的,可能有错误):

     CHROM      POS    QUAL merged
1    chr10 10057508  223.60 chr10_10056721_10058721
2    chr10 10057509  223.60 chr10_10056721_10058721
3    chr10 10057514  223.60 chr10_10056721_10058721
300  chr17 18183700 1847.03 chr17_18181827_18183827
301  chr17 18377233  532.06 chr17_18375777_18377777
302  chr17 27975024   93.60 chr17_27974223_27976223
303  chr17 27975027  157.64 chr17_27974223_27976223
8000 chr12  6923083  217.60 chr12_6922462_6924462
8001 chr12 70920775  157.60 chr12_70920738_70922738
8002 chr12 70920776  157.60 chr12_70920738_70922738
8003 chr12 70920777  157.60 chr12_70920738_70922738

当然,df2的merged列中可能有两个或更多ID同时匹配df1中的同一行。在这种情况下,可以将该行重复多次、每次对应一个新添加的ID,也可以把这些ID用逗号分隔列在同一行中。

我做了类似的事情,但是效果不佳,现在我正在尝试对其进行修复,但是我愿意接受您的帮助。

# For each row of df1, collect the `merged` IDs of every df2 interval that is
# on the same chromosome and contains POS (bounds inclusive). Several matching
# IDs are joined with commas; rows without any match get NA.
#
# Corrected form of the nested sapply() attempt, which returned one value per
# (df2 row, df1 row) pair -- an nrow(df2) x nrow(df1) matrix -- instead of a
# single value per df1 row.
df1$merged <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    # as.character() keeps the comparison valid when the chromosome columns
    # were read as factors with different level sets.
    same_chrom <- as.character(df2$chr) == as.character(df1$CHROM[i])
    in_range <- df1$POS[i] >= df2$start & df1$POS[i] <= df2$stop
    # which() discards NA comparisons, so missing values never count as hits.
    hits <- which(same_chrom & in_range)
    if (length(hits) > 0) {
      paste(as.character(df2$merged[hits]), collapse = ",")
    } else {
      NA_character_
    }
  },
  character(1)
)

2 个答案:

答案 0 :(得分:2)

使用sqldf进行条件连接的另一个选项:

library(sqldf)

# Left join on matching chromosome with POS inside [start, stop]; df1 rows
# without a matching interval are kept, with merged set to NA.
interval_query <- "SELECT df1.*, df2.merged FROM df1 LEFT JOIN df2 on df1.CHROM = df2.chr AND df1.POS BETWEEN df2.start AND df2.stop"
sqldf(interval_query)


   CHROM      POS    QUAL                  merged
1  chr10 10057508  223.60 chr10_10056721_10058721
2  chr10 10057509  223.60 chr10_10056721_10058721
3  chr10 10057514  223.60 chr10_10056721_10058721
4  chr17 18183700 1847.03 chr17_18181827_18183827
5  chr17 18377233  532.06 chr17_18375777_18377777
6  chr17 27975024   93.60 chr17_27974223_27976223
7  chr17 27975027  157.64 chr17_27974223_27976223
8  chr12  6923083  217.60   chr12_6922462_6924462
9  chr12 70920775  157.60 chr12_70920738_70922738
10 chr12 70920776  157.60 chr12_70920738_70922738
11 chr12 70920777  157.60 chr12_70920738_70922738

答案 1 :(得分:1)

这组数据中似乎不存在ID1重复的情况。但是,对于重复的ID2,您希望采用什么处理逻辑?

library(tidyverse)

# Same example table as in the question, but with the record number kept as
# an explicit ID1 column instead of being used as row names.
Input <- "ID1  CHROM      POS    QUAL
1    chr10 10057508  223.60
2    chr10 10057509  223.60
3    chr10 10057514  223.60
300  chr17 18183700 1847.03
301  chr17 18377233  532.06
302  chr17 27975024   93.60
303  chr17 27975027  157.64
8000 chr12  6923083  217.60
8001 chr12 70920775  157.60
8002 chr12 70920776  157.60
8003 chr12 70920777  157.60"
# `text =` makes read.table() create and close its own connection. An
# already-open textConnection() passed as `file` is never closed by
# read.table() and lingers until garbage collection warns about it.
# row.names = NULL keeps ID1 as a regular column with automatic row numbers.
df1 <- read.table(text = Input, header = TRUE, row.names = NULL)

# Same interval table as in the question, but with the record number kept as
# an explicit ID2 column instead of being used as row names.
Input <- "ID2    chr     start      stop                  merged
1       chr1      9868     11868          chr1_9868_11868
852     chr1   2814998   2816998     chr1_2814998_2816998
117618 chr10  10056721  10058721  chr10_10056721_10058721
185773 chr17  18181827  18183827  chr17_18181827_18183827
185853 chr17  18375777  18377777  chr17_18375777_18377777
186710 chr17  27974223  27976223  chr17_27974223_27976223
139286 chr12   6922462   6924462    chr12_6922462_6924462
145955 chr12  70920738  70922738  chr12_70920738_70922738
66558   chr5 132871443 132873443 chr5_132871443_132873443
78653   chr6 125818858 125820858 chr6_125818858_125820858"
# `text =` makes read.table() create and close its own connection. An
# already-open textConnection() passed as `file` is never closed by
# read.table() and lingers until garbage collection warns about it.
# row.names = NULL keeps ID2 as a regular column with automatic row numbers.
df2 <- read.table(text = Input, header = TRUE, row.names = NULL)


# Pair every df1 row with all df2 intervals on the same chromosome, then keep
# only the pairs whose POS lies within [start, stop]. Rows for which the range
# test is not TRUE (including df1 rows with no interval on their chromosome,
# where start/stop are NA) are dropped by filter().
data <- df1 %>%
  left_join(df2, by = c("CHROM" = "chr")) %>%
  filter(POS >= start, POS <= stop)
相关问题