选择与R中两个数据帧之间的条件匹配的行

时间:2016-06-07 04:36:32

标签: r loops compare subset

我有一个基因组坐标文件(p)和另一个变种文件及其位置(vars)。我想得到起始位置大于基因组起始位置的变体,以及比基因组终点位置少的终点位置。变体文件的长度小于坐标文件。我不断得到这个错误,“较长的物体长度不是较短物体长度的倍数。”如果有人可以提供帮助,感谢您的帮助!!

编辑:这是数据的结构:

>dput(droplevels(head(p, 4)))
structure(list(chr = structure(c(1L, 1L, 1L, 1L), .Label = "chr13", class = "factor"), 
chrStart = c(19019000L, 19020000L, 19020000L, 19021000L), 
chrEnd = c(19020000L, 19020000L, 19021000L, 19021000L),Number = c(1L, 
29L, 53L, 60L)), .Names = c("chr", "chrStart", "chrEnd", 
"Number"), row.names = c(NA, 4L), class = "data.frame")

>dput(droplevels(head(chr13, 4)))
structure(list(Var = structure(1:4, .Label = c("13:23798029-23799959", 
"13:19019221-19019456", "13:19018226-19019462", "13:94818369-94822017"
), class = "factor"), Chr = c(13L, 13L, 13L, 13L), vStart = c(23798029L, 
85571820L, 94818226L, 94818369L), vEnd = c(23799959L, 85574142L, 
94822462L, 94822017L), CpG = structure(c(3L, 2L, 1L, 1L), .Label =c("cg17183991", 
"cg17921034", "cg26611683"), class = "factor"), Gene = structure(c(2L, 
1L, 3L, 3L), .Label = c("AKAP11", "FOXO1A", "HS6ST3"), class = "factor"), 
width = c(16338960L, 43828646L, 720767L, 720918L), p = c(0.424, 
0.418, 0.385, 0.338), X.NAME. = c(3.9026, 3.8357, 2.3456, 
2.583), X.NAME..1 = c(3.7245, 3.7267, 2.3467, 2.2076), X.NAME..2 = c(4.8623, 
4.7102, 3.2994, 3.1719), Sourc = structure(c(2L, 2L, 1L, 
2L), .Label = c("T1", "T2"), class = "factor"), 
NominalPvalue = c(0.023992, 0.0002875, 0.0049597, 
0.002612036)), .Names = c("Var", "Chr", "vStart", "vEnd", 
"CpG", "Gene", "width", "p", "X.NAME.", "X.NAME..1", 
"X.NAME..2", "Sourc", "Normalized"), row.names = c(29L, 278L, 
304L, 305L), class = "data.frame")







p <- read.csv("chr13.csv", header = TRUE)
vars <- read.csv("../variants.csv", header=T)
chr13 <-subset(vars, vars$Chr=="13")

for(i in 1:nrow(p)){
   curP <- p[i,]

   k <- subset(chr13, chr13$vStart > curP$chrStart & chr13$vEnd < curP$chrEnd)
   for(m in 1:nrow(k)){
     curM <- k[m, ]
     x <- as.data.frame(curM[,1:ncol(curM)])
     y <- as.data.frame(curP[,1:ncol(curP)])
    if (nrow(k)>0)
       keep <- data.frame(Variant=curM$Variant,
                     Chr=curM$Chr,
                     vStart=curM$vStart,
                     CpG=curM$Cpg,
                     Gene=curM$Gene,
                     vEnd=curM$vEnd,
                     ChrStart=curP$chrStart,
                     ChrEnd=curP$chrEnd
                     )
matches <- cbind(matches,keep)

}  }

文件1:

chr Start End Number
chr1    12001  13000      2
chr1    13000  13000     10
chr1    13010  14000      6
chr1    13020  15000      2
chr1    14000  15000     10
chr1    15000  15000      4
chr1    15300  16000     12
chr1    13000  51000      1
chr1    48000  52000      1
chr1    51000  52000      4

文件2:

variant  chr  chrStart chrEnd cpg    gene
var128   1    13467   13499   cg27611665   FBXL12  
var229   1    48117   48334   cg27611665   FBXL12 
var509   1    568289  568419  cg2511665    FBXL12
var213   1    186392  186392  cg2558303    SLC25A4
var999   1    401909  401963  cg27472032   VPS39
var122   1    182444  182494  cg2743794    FXR1
var098   1    602184  602248  cg27398547   C14orf39
var876   1    157302  157344  cg27355746   UBTF
var287   1    163665  163709  cg2752122    PHF20L1`

因此,最终结果应符合以下条件:

variant  chr  chrStart chrEnd CpG    gene  chr Start End Number
var128   1    13467   13499   cg27611665   FBXL12 chr1    13010  14000      6
var229   1    48117   48334   cg27611665   FBXL12 chr1    48000  52000      1

0 个答案:

没有答案