我有两个数据框:一个包含SNP及其位置的列表,另一个包含基因及其起始和结束坐标的列表。我想使用dplyr在SNP数据框中添加一列,列出每个SNP所属基因的名称(即SNP与基因位于同一染色体上,并且SNP的位置落在该基因的起始/结束坐标之内(含端点))。
如果SNP不在任何基因的坐标范围内,则其GENE_NAME列应为NA。SNP和基因的染色体编号必须一致。例如,第二个SNP的位置虽然落在Gene4的起始/结束坐标之内,但并不匹配,因为二者位于不同的染色体上。
SNP数据框:
CHR POS REF ALT
01 5 C T
01 10 G A
02 5 G T
02 15 C A
02 20 T C
03 10 A G
03 20 C T
GENE数据框:
CHR START END GENE_NAME
01 2 8 Gene1
01 12 20 Gene2
01 25 30 Gene3
02 10 18 Gene4
02 25 35 Gene5
03 5 15 Gene6
所需的输出:
CHR POS REF ALT GENE_NAME
01 5 C T Gene1
01 10 G A NA
02 5 G T NA
02 15 C A Gene4
02 20 T C NA
03 10 A G Gene6
03 20 C T NA
同样,我想使用dplyr完成此操作。预先感谢您的帮助!
答案 0 :(得分:3)
使用purrr中的map2_chr的一个选项是:根据SNP中的POS和CHR过滤GENE数据框,然后选择相应的GENE_NAME。
library(dplyr)
library(purrr)
# Annotate each SNP with the gene that contains it: for every (POS, CHR)
# pair, find the rows of GENE on the same chromosome whose inclusive
# [START, END] interval covers the position.
SNP %>%
  mutate(GENE_NAME = map2_chr(POS, CHR, function(pos, chr) {
    hits <- which(chr == GENE$CHR & pos >= GENE$START & pos <= GENE$END)
    # When several genes overlap, keep the first one in GENE row order.
    # Return a typed NA so map2_chr() always receives a length-1 character.
    if (length(hits) > 0) GENE$GENE_NAME[hits[1]] else NA_character_
  }))
# CHR POS REF ALT GENE_NAME
#1 1 5 C T Gene1
#2 1 10 G A <NA>
#3 2 5 G T <NA>
#4 2 15 C A Gene4
#5 2 20 T C <NA>
#6 3 10 A G Gene6
#7 3 20 C T <NA>
。
mapply
在基数R中,可以使用mapply(function(pos, chr) {
  # Rows of GENE on the same chromosome whose inclusive [START, END]
  # interval covers this SNP position.
  hits <- which(chr == GENE$CHR & pos >= GENE$START & pos <= GENE$END)
  # First match wins; a typed NA keeps the simplified result a character
  # vector even when no SNP falls inside any gene.
  if (length(hits) > 0) GENE$GENE_NAME[hits[1]] else NA_character_
}, SNP$POS, SNP$CHR)
#[1] "Gene1" NA NA "Gene4" NA "Gene6" NA
# Example inputs for the SNP-to-gene lookup.
# SNP: one row per variant (chromosome, position, reference and alternate
# allele).
SNP <- data.frame(
  CHR = c(1L, 1L, 2L, 2L, 2L, 3L, 3L),
  POS = c(5L, 10L, 5L, 15L, 20L, 10L, 20L),
  REF = c("C", "G", "G", "C", "T", "A", "C"),
  ALT = c("T", "A", "T", "A", "C", "G", "T"),
  stringsAsFactors = FALSE
)
# GENE: one row per gene (chromosome, start/end coordinates, name).
GENE <- data.frame(
  CHR = c(1L, 1L, 1L, 2L, 2L, 3L),
  START = c(2L, 12L, 25L, 10L, 25L, 5L),
  END = c(8L, 20L, 30L, 18L, 35L, 15L),
  GENE_NAME = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"),
  stringsAsFactors = FALSE
)
数据
{{1}}
答案 1 :(得分:3)
这是使用 class ViewController: UIViewController {
    /// Label that shows the battery charge as a whole-number percentage.
    @IBOutlet weak var level_Battery: UILabel!

    /// Enables battery monitoring and writes the current charge level
    /// into `level_Battery` once the view has loaded.
    override func viewDidLoad() {
        super.viewDidLoad()
        // batteryLevel is only meaningful while monitoring is enabled.
        UIDevice.current.isBatteryMonitoringEnabled = true
        let level = UIDevice.current.batteryLevel
        // UIKit reports -1.0 when the battery state is unknown (for example
        // in the Simulator); show a placeholder instead of "-100%".
        guard level >= 0 else {
            level_Battery.text = "--%"
            return
        }
        // Round rather than truncate so a Float product such as 56.999996
        // is shown as 57, not 56.
        let battery_Level = Int((level * 100).rounded())
        level_Battery.text = "\(battery_Level)%"
    }
}
的一种方法(基于dplyr)。您只需根据START和END展开gene数据框,然后使用left_join进行连接。
答案 2 :(得分:2)
这是将Public Function Levenshtein(s1 As String, s2 As String)
    ' Returns the Levenshtein (edit) distance between s1 and s2 using the
    ' classic dynamic-programming table d(0..Len(s1), 0..Len(s2)).
    ' Side effect (debugging aid): clears the active sheet and draws the
    ' table on it, matches in blue and edits in red.
    Dim i As Integer, j As Integer
    Dim l1 As Integer, l2 As Integer
    Dim min1 As Integer, min2 As Integer
    Dim d() As Integer

    'For debugging purposes only
    ' Qualify with ActiveSheet so the cleared sheet is always the one that
    ' rngOutput lives on, regardless of which module hosts this function.
    ActiveSheet.Cells.Clear
    Dim rngOutput As Range: Set rngOutput = ActiveSheet.Range("A1").Resize(Len(s1) + 2, Len(s2) + 2)

    With rngOutput
        .ColumnWidth = 3
        .HorizontalAlignment = xlCenter
        .VerticalAlignment = xlCenter
    End With

    l1 = Len(s1): l2 = Len(s2): ReDim d(l1, l2)

    ' First column: distance from a prefix of s1 to the empty string.
    ' Also writes the characters of s1 down column 1 and the row indices.
    For i = 0 To l1
        d(i, 0) = i
        With rngOutput
            .Cells(i + 3, 1) = Mid(s1, i + 1, 1)
            If Not i = 0 Then .Cells(i + 2, 2) = i
        End With
    Next i

    ' First row: distance from the empty string to a prefix of s2.
    ' Also writes the characters of s2 across row 1 and the column indices.
    For j = 0 To l2
        d(0, j) = j
        With rngOutput
            .Cells(1, j + 3) = Mid(s2, j + 1, 1)
            If Not j = 0 Then .Cells(2, j + 2) = j
        End With
    Next j

    For i = 1 To l1
        For j = 1 To l2
            If Mid(s1, i, 1) = Mid(s2, j, 1) Then
                ' Characters match: carry the diagonal value over unchanged.
                d(i, j) = d(i - 1, j - 1)
                With rngOutput.Cells(i + 2, j + 2)
                    .Value = d(i, j)
                    .Font.Color = vbBlue
                End With
            Else
                ' Cheapest of the three neighbouring cells (up, left,
                ' diagonal), each plus one edit.
                min1 = d(i - 1, j) + 1
                min2 = d(i, j - 1) + 1
                If min2 < min1 Then
                    min1 = min2
                End If
                min2 = d(i - 1, j - 1) + 1
                If min2 < min1 Then
                    min1 = min2
                End If
                d(i, j) = min1
                ' Write through rngOutput, like the match branch, so both
                ' branches always target the same sheet.
                With rngOutput.Cells(i + 2, j + 2)
                    .Value = d(i, j)
                    .Font.Color = vbRed
                End With
            End If
        Next
    Next

    Levenshtein = d(l1, l2)
End Function
与data.table的non-equi(非等值)连接结合使用的一种选择:
library(data.table)
# Non-equi update join: setDT() converts snp to a data.table by reference,
# then every snp row whose CHR equals a gene's CHR and whose POS lies in
# that gene's inclusive [START, END] range receives the gene's name.
# Rows without a match keep NA. The i. prefix pins the right-hand side to
# gene's column, so the assignment stays correct even if snp already has
# a GENE_NAME column (for example when the line is run twice).
setDT(snp)[gene, GENE_NAME := i.GENE_NAME,
           on = .(CHR, POS >= START, POS <= END)]
snp
# CHR POS REF ALT GENE_NAME
#1: 1 5 C T Gene1
#2: 1 10 G A <NA>
#3: 2 5 G T <NA>
#4: 2 15 C A Gene4
#5: 2 20 T C <NA>
#6: 3 10 A G Gene6
#7: 3 20 C T <NA>
或使用fuzzyjoin:
library(fuzzyjoin)
library(dplyr)
# Left-join snp to gene with one comparison per key pair: CHR must be
# equal, POS must be >= START and POS must be <= END. The join keeps both
# CHR columns (suffixed .x/.y), so select() restores the original layout
# using snp's copy.
select(
  fuzzy_left_join(
    snp,
    gene,
    by = c("CHR", "POS" = "START", "POS" = "END"),
    match_fun = list(`==`, `>=`, `<=`)
  ),
  CHR = CHR.x, POS, REF, ALT, GENE_NAME
)
# CHR POS REF ALT GENE_NAME
#1 1 5 C T Gene1
#2 1 10 G A <NA>
#3 2 5 G T <NA>
#4 2 15 C A Gene4
#5 2 20 T C <NA>
#6 3 10 A G Gene6
#7 3 20 C T <NA>
或使用rap的选项:
library(rap)
# Row-wise lookup with rap(): for each SNP, collect the names of the genes
# on the same chromosome whose [START, END] range contains POS. This
# yields a list column with zero or more names per row.
snp %>%
  rap(GENE_NAME = ~ filter(gene, CHR == !!CHR, START <= POS, END >= POS) %>%
    pull(GENE_NAME)) %>%
  # Rows with no matching gene hold an empty vector; swap in a typed NA so
  # the row survives unnesting and the column stays character.
  mutate(
    GENE_NAME = replace(GENE_NAME, lengths(GENE_NAME) == 0, NA_character_)
  ) %>%
  # Name the column explicitly; calling unnest() without `cols` is
  # deprecated in tidyr >= 1.0.
  unnest(cols = GENE_NAME)
# CHR POS REF ALT GENE_NAME
#1 1 5 C T Gene1
#2 1 10 G A <NA>
#3 2 5 G T <NA>
#4 2 15 C A Gene4
#5 2 20 T C <NA>
#6 3 10 A G Gene6
#7 3 20 C T <NA>