我有两个数据框:
data<-structure(list(sample = structure(c(1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 2L), .Label = c("S1", "S2"), class = "factor"), chrom = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L), .Label = c("2L", "2R"), class = "factor"),
pos = c(318351L, 605574L, 1014043L, 2031592L, 2886957L, 2910379L,
2218351L, 105574L, 1344043L, 216957L)), .Names = c("sample",
"chrom", "pos"), row.names = c(NA, 10L), class = "data.frame")
> arrange(data, chrom,sample)
sample chrom pos
1 S1 2L 318351
2 S1 2L 2218351
3 S2 2L 605574
4 S2 2L 1014043
5 S2 2L 105574
6 S2 2L 1344043
7 S1 2R 2031592
8 S1 2R 2886957
9 S1 2R 2910379
10 S2 2R 216957
svBreaks<-structure(list(sample = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 1L), .Label = c("S1", "S2"), class = "factor"), chrom = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("2L", "2R"), class = "factor"),
bp = c(2425901L, 2426025L, 6694426L, 6694566L, 8387755L,
8387927L, 8963713L, 963799L, 980364L, 980521L), gene = structure(c(3L,
3L, 5L, 5L, 4L, 4L, 2L, 2L, 1L, 1L), .Label = c("CG8213",
"CG8216", "intergenic", "pdm3", "Tsp"), class = "factor"),
type = structure(c(2L, 1L, 2L, 1L, 3L, 3L, 3L, 4L, 4L, 3L
), .Label = c("DEL", "DUP", "INV", "TANDUP"), class = "factor")), row.names = c(NA,
10L), .Names = c("sample", "chrom", "bp", "gene", "type"), class = "data.frame")
> arrange(svBreaks, chrom, sample)
sample chrom bp gene type
1 S1 2L 2425901 intergenic DUP
2 S1 2L 2426025 intergenic DEL
3 S1 2L 6694426 Tsp DUP
4 S2 2L 6694566 Tsp DEL
5 S2 2L 8387755 pdm3 INV
6 S2 2L 8387927 pdm3 INV
7 S1 2R 8963713 CG8216 INV
8 S1 2R 963799 CG8216 TANDUP
9 S1 2R 980521 CG8213 INV
10 S2 2R 980364 CG8213 TANDUP
这是一个函数,用于计算pos
中data
与bp
svBreaks
中每个样本chrom
上fun3 <- function(p) {
index<-which.min(abs(sv_df$bp - p))
closestBp<-as.numeric(sv_df$bp[index])
chrom<-as.character(sv_df$chrom[index])
gene<-as.character(sv_df$gene[index])
sample<-as.character(sv_df$sample[index])
type<-as.character(sv_df$type[index])
dist<-(p-closestBp)
list(p, closestBp, dist, chrom, gene, type, sample)
}
之间的距离:
chrom
我想遍历每个sample
,并为每个sample
调用该函数,并构建一个包含所有chrom
和l <- list()
for (c in levels(data$chrom)){
for (s in levels(data$sample)){
# Filter in chromosome and sample
df<-filter(data, chrom == c & sample == s)
sv_df<-filter(svBreaks, chrom == c & sample == s)
# Apply funciton
dist2bp<-lapply(df$pos, fun3)
dist2bp<-do.call(rbind, dist2bp)
dist2bp<-as.data.frame(dist2bp)
colnames(dist2bp)=c("snp", "closest_bp", "min_dist", "chrom", "closest_gene", "type", "sample")
l[[s]] <- dist2bp
}
}
的数据框。这是我到目前为止所得到的:
chrom
但是,这并没有给我预期的输出,只保存了> levels(data$chrom)
[1] "2L" "2R"
> levels(data$sample)
[1] "S1" "S2"
snp closest_bp min_dist chrom closest_gene type sample
S1.1 2031592 980521 1051071 2R CG8213 INV S1
S1.2 2886957 980521 1906436 2R CG8213 INV S1
S1.3 2910379 980521 1929858 2R CG8213 INV S1
S2 216957 980364 -763407 2R CG8213 TANDUP S2
的第二级 - 这里是结果数据框:
chrom
如果我还添加到for (c in levels(data$chrom)){
for (s in levels(data$sample)){
[...]
l[[s]] <- dist2bp
}
l[[c]] <- dist2bp
}
snp closest_bp min_dist chrom closest_gene type sample
S1.1 2031592 980521 1051071 2R CG8213 INV S1 # Chrom level 2, S1
S1.2 2886957 980521 1906436 2R CG8213 INV S1
S1.3 2910379 980521 1929858 2R CG8213 INV S1
S2 216957 980364 -763407 2R CG8213 TANDUP S2 # Chrom level 2, S2
2L.1 605574 6694566 -6088992 2L Tsp DEL S2 # Chrom level 1, S2
2L.2 1014043 6694566 -5680523 2L Tsp DEL S2
2L.3 105574 6694566 -6588992 2L Tsp DEL S2
2L.4 1344043 6694566 -5350523 2L Tsp DEL S2
2R 216957 980364 -763407 2R CG8213 TANDUP S2 # Not sure why this is here, # Chrom level 1, S1 is missing
循环中的列表,输出也不正确:
[0] => 'Name':'...',
'Locality':'...',
'Date':'...',
'Address': [0] => '...',
[1] => '...',
[2] => '...',
...
...
有人能指出我正确的方向吗?