StackOverflow问题
Hello研究员,
我正试图用 R 对多个数据框(data frame)做"交叉"(取交集)。
我的数据框来自高通量测序实验,如下所示:
df1:
chr pos orient weight in_nucleosome in_subtelo
1 NC_001133 999 + 1 TRUE TRUE
2 NC_001133 1505 - 14 FALSE TRUE
3 NC_001133 1525 - 2 TRUE TRUE
4 NC_001134 480 + 1 TRUE TRUE
5 NC_001134 509 + 2 FALSE TRUE
6 NC_001134 539 + 3 FALSE TRUE
7 NC_001135 1218 + 1 TRUE TRUE
8 NC_001135 1228 + 2 TRUE TRUE
9 NC_001135 1273 + 1 TRUE TRUE
10 NC_001136 362 + 1 TRUE TRUE
和
DF2:
chr feature start end orient
1 NC_001133 ARS 707 776 .
2 NC_001133 ARS 7997 8547 .
3 NC_001133 ARS 30946 31183 .
4 NC_001133 ARS_consensus_sequence 31002 31018 +
5 NC_001133 ARS_consensus_sequence 70418 70434 -
6 NC_001133 ARS_consensus_sequence 124463 124479 -
7 NC_001136 blocked_reading_frame 721071 721481 -
8 NC_001137 blocked_reading_frame 375215 377614 -
9 NC_001141 blocked_reading_frame 29032 30048 +
10 NC_001133 CDS 335 649 +
我想知道的是:对于给定的染色体(这里是 chr 列)和每一个 df2$feature,是否满足 df2$start < df1$pos < df2$end。然后我想在 df1 中为每个 df2$feature 添加一列,列名就是该 feature 的名称,并根据上述条件填入 TRUE 或 FALSE。
我很确定需要使用 apply 系列函数,可能还要相互嵌套,但尝试了几个小时后我仍然没能做出来。
我使用嵌套for循环以非常不优雅,冗长且容易出错的方式做到了,但我确信有更好的更简单,也许更快的解决方案。
感谢您阅读本文,
安托。
答案 0 :(得分:0)
虽然用 `dplyr` 也许可以做到(我试过,但对它不够熟练),不过我(认为我)用 `foreach` 和 `iterators` 实现了它:
您的数据:
# Sample data for df1 (the positions table shown in the question).
# Row names are kept as the character strings "1".."10" and the numeric
# columns are stored as integers, exactly as in the dput() form.
df1 <- data.frame(
  chr = rep(
    c("NC_001133", "NC_001134", "NC_001135", "NC_001136"),
    times = c(3L, 3L, 3L, 1L)
  ),
  pos = c(999L, 1505L, 1525L, 480L, 509L, 539L, 1218L, 1228L, 1273L, 362L),
  orient = c("+", "-", "-", rep("+", 7L)),
  weight = c(1L, 14L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 1L),
  in_nucleosome = c(
    TRUE, FALSE, TRUE, TRUE, FALSE,
    FALSE, TRUE, TRUE, TRUE, TRUE
  ),
  in_subtelo = rep(TRUE, 10L),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)
# Sample data for df2 (the feature-interval table shown in the question).
# Row names are kept as the character strings "1".."10" and the interval
# bounds are stored as integers, exactly as in the dput() form.
df2 <- data.frame(
  chr = c(
    rep("NC_001133", 6L),
    "NC_001136", "NC_001137", "NC_001141", "NC_001133"
  ),
  feature = rep(
    c("ARS", "ARS_consensus_sequence", "blocked_reading_frame", "CDS"),
    times = c(3L, 3L, 3L, 1L)
  ),
  start = c(
    707L, 7997L, 30946L, 31002L, 70418L,
    124463L, 721071L, 375215L, 29032L, 335L
  ),
  end = c(
    776L, 8547L, 31183L, 31018L, 70434L,
    124479L, 721481L, 377614L, 30048L, 649L
  ),
  orient = c(rep(".", 3L), "+", "-", "-", "-", "-", "+", "+"),
  row.names = as.character(1:10),
  stringsAsFactors = FALSE
)
由于我认为您的数据没有任何匹配,我会注入一些:
## Overwrite the positions so the example produces some matches: rows 3, 9
## and 10 differ from the original data (8000, 1272 and 721072).
df1[["pos"]] <- c(
  999, 1505, 8000, 480, 509,
  539, 1218, 1228, 1272, 721072
)
代码:
library(foreach)
library(iterators)
## Pre-populate df1 with one logical column per distinct feature type, so
## every row returned below has the same set of columns for rbind().
df1[unique(df2$feature)] <- FALSE

## Walk df1 one row at a time. For each feature type present on the row's
## chromosome, set that feature's column to TRUE when at least one df2
## interval of that type strictly contains the position
## (start < pos < end). Feature types absent from the chromosome keep the
## pre-populated FALSE.
df1a <- foreach(subdf1 = iter(df1, by = "row"), .combine = rbind) %do% {
  ## The chromosome mask does not depend on the feature, so compute it once.
  same_chr <- df2$chr == subdf1$chr
  for (feature in unique(df2$feature[same_chr])) {
    ## `feature` was taken from rows on this chromosome, so `idx` always
    ## selects at least one interval; no emptiness guard is needed.
    idx <- same_chr & (df2$feature == feature)
    subdf1[[feature]] <- any(
      (df2$start[idx] < subdf1$pos) & (subdf1$pos < df2$end[idx])
    )
  }
  subdf1
}
df1a
## chr pos orient weight in_nucleosome in_subtelo ARS
## 1 NC_001133 999 + 1 TRUE TRUE FALSE
## 2 NC_001133 1505 - 14 FALSE TRUE FALSE
## 3 NC_001133 8000 - 2 TRUE TRUE TRUE
## 4 NC_001134 480 + 1 TRUE TRUE FALSE
## 5 NC_001134 509 + 2 FALSE TRUE FALSE
## 6 NC_001134 539 + 3 FALSE TRUE FALSE
## 7 NC_001135 1218 + 1 TRUE TRUE FALSE
## 8 NC_001135 1228 + 2 TRUE TRUE FALSE
## 9 NC_001135 1272 + 1 TRUE TRUE FALSE
## 10 NC_001136 721072 + 1 TRUE TRUE FALSE
## ARS_consensus_sequence blocked_reading_frame CDS
## 1 FALSE FALSE FALSE
## 2 FALSE FALSE FALSE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE FALSE FALSE
## 6 FALSE FALSE FALSE
## 7 FALSE FALSE FALSE
## 8 FALSE FALSE FALSE
## 9 FALSE FALSE FALSE
## 10 FALSE TRUE FALSE
使用 `foreach` 和 `iterators` 的一个附带好处是:如果数据量很大并且您使用 `doParallel`,只需将 `%do%` 替换为 `%dopar%`,计算就会在您注册的并行后端上并行执行。您可以把下面的代码放在上述所有代码之前,作为前置设置:
library(doParallel)
## Leave one core available for the rest of the system, but always request
## at least one worker: detectCores() may return NA or 1, and a worker
## count of NA or 0 makes makeCluster() fail.
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cl)
## replace %do% with %dopar%, do all of the above code
## clean up
stopCluster(cl)