我有一个基因组坐标文件(p)和另一个变种文件及其位置(vars)。我想得到起始位置大于基因组起始位置的变体,以及比基因组终点位置少的终点位置。变体文件的长度小于坐标文件。我不断得到这个错误,“较长的物体长度不是较短物体长度的倍数。”如果有人可以提供帮助,感谢您的帮助!!
编辑:这是数据的结构:
>dput(droplevels(head(p, 4)))
structure(list(chr = structure(c(1L, 1L, 1L, 1L), .Label = "chr13", class = "factor"),
chrStart = c(19019000L, 19020000L, 19020000L, 19021000L),
chrEnd = c(19020000L, 19020000L, 19021000L, 19021000L),Number = c(1L,
29L, 53L, 60L)), .Names = c("chr", "chrStart", "chrEnd",
"Number"), row.names = c(NA, 4L), class = "data.frame")
>dput(droplevels(head(chr13, 4)))
structure(list(Var = structure(1:4, .Label = c("13:23798029-23799959",
"13:19019221-19019456", "13:19018226-19019462", "13:94818369-94822017"
), class = "factor"), Chr = c(13L, 13L, 13L, 13L), vStart = c(23798029L,
85571820L, 94818226L, 94818369L), vEnd = c(23799959L, 85574142L,
94822462L, 94822017L), CpG = structure(c(3L, 2L, 1L, 1L), .Label =c("cg17183991",
"cg17921034", "cg26611683"), class = "factor"), Gene = structure(c(2L,
1L, 3L, 3L), .Label = c("AKAP11", "FOXO1A", "HS6ST3"), class = "factor"),
width = c(16338960L, 43828646L, 720767L, 720918L), p = c(0.424,
0.418, 0.385, 0.338), X.NAME. = c(3.9026, 3.8357, 2.3456,
2.583), X.NAME..1 = c(3.7245, 3.7267, 2.3467, 2.2076), X.NAME..2 = c(4.8623,
4.7102, 3.2994, 3.1719), Sourc = structure(c(2L, 2L, 1L,
2L), .Label = c("T1", "T2"), class = "factor"),
NominalPvalue = c(0.023992, 0.0002875, 0.0049597,
0.002612036)), .Names = c("Var", "Chr", "vStart", "vEnd",
"CpG", "Gene", "width", "p", "X.NAME.", "X.NAME..1",
"X.NAME..2", "Sourc", "Normalized"), row.names = c(29L, 278L,
304L, 305L), class = "data.frame")
p <- read.csv("chr13.csv", header = TRUE)
vars <- read.csv("../variants.csv", header=T)
chr13 <-subset(vars, vars$Chr=="13")
for(i in 1:nrow(p)){
curP <- p[i,]
k <- subset(chr13, chr13$vStart > curP$chrStart & chr13$vEnd < curP$chrEnd)
for(m in 1:nrow(k)){
curM <- k[m, ]
x <- as.data.frame(curM[,1:ncol(curM)])
y <- as.data.frame(curP[,1:ncol(curP)])
if (nrow(k)>0)
keep <- data.frame(Variant=curM$Variant,
Chr=curM$Chr,
vStart=curM$vStart,
CpG=curM$Cpg,
Gene=curM$Gene,
vEnd=curM$vEnd,
ChrStart=curP$chrStart,
ChrEnd=curP$chrEnd
)
matches <- cbind(matches,keep)
} }
文件1:
chr Start End Number
chr1 12001 13000 2
chr1 13000 13000 10
chr1 13010 14000 6
chr1 13020 15000 2
chr1 14000 15000 10
chr1 15000 15000 4
chr1 15300 16000 12
chr1 13000 51000 1
chr1 48000 52000 1
chr1 51000 52000 4
文件2:
variant chr chrStart chrEnd cpg gene
var128 1 13467 13499 cg27611665 FBXL12
var229 1 48117 48334 cg27611665 FBXL12
var509 1 568289 568419 cg2511665 FBXL12
var213 1 186392 186392 cg2558303 SLC25A4
var999 1 401909 401963 cg27472032 VPS39
var122 1 182444 182494 cg2743794 FXR1
var098 1 602184 602248 cg27398547 C14orf39
var876 1 157302 157344 cg27355746 UBTF
var287 1 163665 163709 cg2752122 PHF20L1`
因此,最终结果应符合以下条件:
variant chr chrStart chrEnd CpG gene chr Start End Number
var128 1 13467 13499 cg27611665 FBXL12 chr1 13010 14000 6
var229 1 48117 48334 cg27611665 FBXL12 chr1 48000 52000 1