我不得不再次提出一个与我最近问过的问题(见此处)类似的问题,因为我仍然没有得到答案:我现在有了一个新的数据集,而之前的解决方案很遗憾没能奏效……
我有两个数据集:
# Variant table: one row per position (CHROM, POS, QUAL). The header has one
# field fewer than the data rows, so the first data column becomes row names.
Input <- " CHROM POS QUAL
1 chr10 10057508 223.60
2 chr10 10057509 223.60
3 chr10 10057514 223.60
300 chr17 18183700 1847.03
301 chr17 18377233 532.06
302 chr17 27975024 93.60
303 chr17 27975027 157.64
8000 chr12 6923083 217.60
8001 chr12 70920775 157.60
8002 chr12 70920776 157.60
8003 chr12 70920777 157.60"
# `text =` lets read.table manage (and close) its own connection, unlike a bare
# textConnection() that would stay open; read.table already returns a data.frame.
df1 <- read.table(text = Input, header = TRUE, row.names = 1)
# Interval table: one row per region (chr, start, stop) plus its `merged` ID.
# The header has one field fewer than the data rows, so the first data column
# becomes row names.
Input <- " chr start stop merged
1 chr1 9868 11868 chr1_9868_11868
852 chr1 2814998 2816998 chr1_2814998_2816998
117618 chr10 10056721 10058721 chr10_10056721_10058721
185773 chr17 18181827 18183827 chr17_18181827_18183827
185853 chr17 18375777 18377777 chr17_18375777_18377777
186710 chr17 27974223 27976223 chr17_27974223_27976223
139286 chr12 6922462 6924462 chr12_6922462_6924462
145955 chr12 70920738 70922738 chr12_70920738_70922738
66558 chr5 132871443 132873443 chr5_132871443_132873443
78653 chr6 125818858 125820858 chr6_125818858_125820858"
# `text =` lets read.table manage (and close) its own connection, unlike a bare
# textConnection() that would stay open; read.table already returns a data.frame.
df2 <- read.table(text = Input, header = TRUE, row.names = 1)
我想把 df2 中的定位 ID(即 `merged` 列)放到 df1 的对应行中。`CHROM` 和 `chr` 必须相等,并且 `POS` 必须落在 df2 的 `start` 与 `stop` 之间。
因此,期望的输出应如下所示(这是我手工整理的,所以可能有错误):
CHROM POS QUAL merged
1 chr10 10057508 223.60 chr10_10056721_10058721
2 chr10 10057509 223.60 chr10_10056721_10058721
3 chr10 10057514 223.60 chr10_10056721_10058721
300 chr17 18183700 1847.03 chr17_18181827_18183827
301 chr17 18377233 532.06 chr17_18375777_18377777
302 chr17 27975024 93.60 chr17_27974223_27976223
303 chr17 27975027 157.64 chr17_27974223_27976223
8000 chr12 6923083 217.60 chr12_6922462_6924462
8001 chr12 70920775 157.60 chr12_70920738_70922738
8002 chr12 70920776 157.60 chr12_70920738_70922738
8003 chr12 70920777 157.60 chr12_70920738_70922738
当然,df2 的 `merged` 列中可能有两个或更多 ID 同时匹配 df1 中的同一行。这种情况下,可以为每个新增的 ID 重复该行,也可以把这些 ID 用逗号分隔列在同一行中。
我写了类似下面的代码,但效果不理想;我正在尝试修复它,也非常欢迎您的帮助。
# For every row of df1, collect the `merged` IDs of all df2 intervals that lie
# on the same chromosome and contain POS (bounds inclusive). Several hits are
# joined with commas; rows without any hit get NA.
# vapply() guarantees exactly one string per df1 row, so the result is always a
# character vector of length nrow(df1) that can be assigned as a column.
df1$merged <- vapply(
  seq_len(nrow(df1)),
  function(i) {
    # as.character() keeps the comparison valid if the columns are factors.
    hit <- as.character(df2$chr) == as.character(df1$CHROM[i]) &
      df1$POS[i] >= df2$start &
      df1$POS[i] <= df2$stop
    if (any(hit)) {
      paste(as.character(df2$merged[hit]), collapse = ",")
    } else {
      NA_character_
    }
  },
  character(1)
)
答案 0 :(得分:2)
另一种做法是使用 `sqldf` 进行条件连接:
# sqldf lets the data frames df1 and df2 be queried with SQL by name.
library(sqldf)
# LEFT JOIN keeps every df1 row. The ON clause requires the same chromosome and
# POS lying between start and stop (SQL BETWEEN includes both bounds). A df1
# row matching several df2 intervals appears once per match; a row matching
# none is kept with a missing `merged` value.
sqldf("SELECT df1.*, df2.merged FROM df1 LEFT JOIN df2 on df1.CHROM = df2.chr AND df1.POS BETWEEN df2.start AND df2.stop")
CHROM POS QUAL merged
1 chr10 10057508 223.60 chr10_10056721_10058721
2 chr10 10057509 223.60 chr10_10056721_10058721
3 chr10 10057514 223.60 chr10_10056721_10058721
4 chr17 18183700 1847.03 chr17_18181827_18183827
5 chr17 18377233 532.06 chr17_18375777_18377777
6 chr17 27975024 93.60 chr17_27974223_27976223
7 chr17 27975027 157.64 chr17_27974223_27976223
8 chr12 6923083 217.60 chr12_6922462_6924462
9 chr12 70920775 157.60 chr12_70920738_70922738
10 chr12 70920776 157.60 chr12_70920738_70922738
11 chr12 70920777 157.60 chr12_70920738_70922738
答案 1 :(得分:1)
这份数据中似乎不存在 ID1 重复的情况。不过,如果出现重复的 ID2,您希望按什么逻辑来处理?
library(tidyverse)
# Variant table with an explicit ID1 column: the header names all four fields,
# and row.names = NULL keeps ID1 as a regular column instead of row names.
Input <- "ID1 CHROM POS QUAL
1 chr10 10057508 223.60
2 chr10 10057509 223.60
3 chr10 10057514 223.60
300 chr17 18183700 1847.03
301 chr17 18377233 532.06
302 chr17 27975024 93.60
303 chr17 27975027 157.64
8000 chr12 6923083 217.60
8001 chr12 70920775 157.60
8002 chr12 70920776 157.60
8003 chr12 70920777 157.60"
# `text =` lets read.table manage (and close) its own connection, unlike a bare
# textConnection() that would stay open; read.table already returns a data.frame.
df1 <- read.table(text = Input, header = TRUE, row.names = NULL)
# Interval table with an explicit ID2 column: the header names all five fields,
# and row.names = NULL keeps ID2 as a regular column instead of row names.
Input <- "ID2 chr start stop merged
1 chr1 9868 11868 chr1_9868_11868
852 chr1 2814998 2816998 chr1_2814998_2816998
117618 chr10 10056721 10058721 chr10_10056721_10058721
185773 chr17 18181827 18183827 chr17_18181827_18183827
185853 chr17 18375777 18377777 chr17_18375777_18377777
186710 chr17 27974223 27976223 chr17_27974223_27976223
139286 chr12 6922462 6924462 chr12_6922462_6924462
145955 chr12 70920738 70922738 chr12_70920738_70922738
66558 chr5 132871443 132873443 chr5_132871443_132873443
78653 chr6 125818858 125820858 chr6_125818858_125820858"
# `text =` lets read.table manage (and close) its own connection, unlike a bare
# textConnection() that would stay open; read.table already returns a data.frame.
df2 <- read.table(text = Input, header = TRUE, row.names = NULL)
# Pair every df1 row with all df2 intervals on the same chromosome, then keep
# only the pairs whose POS falls inside [start, stop]. Rows of df1 with no
# containing interval are removed by the filter.
data <- df1 %>%
  left_join(df2, by = c("CHROM" = "chr")) %>%
  filter(POS >= start, POS <= stop)