我是R语言的新手。我有一个数据框,其中每一行是一个SNP,包含ref、het、risk三个参考基因型列以及许多样本列。我想编写一个函数来计算每个样本的变异数量:如果样本的基因型等于ref列,则为0;如果等于het列,则为1;如果等于risk列,则为2。
df:
SNP ref het risk Sample1 Sample2 ...
rs1 GG AG AA AG GG
rs2 AA AG GG AG AA
rs3 AA AG GG AG AG
rs4 GG AG AA AG AA
rs5 GG AG AA AG AA
rs6 GG AG AA AG AG
rs7 AA AG GG AA AA
rs8 CC AC AA AC CC
rs9 GG AG AA GG GG
rs10 GG AG AA GG AG
rs11 AA AG GG AA GG
rs12 GG AG AA AA AG
rs13 GG AG AA AG AA
rs14 AA AG GG AG AA
rs15 GG AG AA AA AA
rs16 AA AC CC AA AA
rs17 AA AG GG AA AA
rs18 GG AG AA GG GG
rs19 GG AG AA GG AG
rs20 GG AG AA AG AG
...
desired output:
SNP ref het risk Sample1 Sample2 Sample1.vd Sample2.vd ...
rs1 GG AG AA AG GG 1 0
rs2 AA AG GG AG AA 1 0
rs3 AA AG GG AG AG 1 1
rs4 GG AG AA AG AA 1 2
rs5 GG AG AA AG AA 1 2
rs6 GG AG AA AG AG 1 1
rs7 AA AG GG AA AA 0 0
rs8 CC AC AA AC CC 1 0
rs9 GG AG AA GG GG 0 0
rs10 GG AG AA GG AG 0 1
rs11 AA AG GG AA GG 0 2
rs12 GG AG AA AA AG 2 1
rs13 GG AG AA AG AA 1 2
rs14 AA AG GG AG AA 1 0
rs15 GG AG AA AA AA 2 2
rs16 AA AC CC AA AA 0 0
rs17 AA AG GG AA AA 0 0
rs18 GG AG AA GG GG 0 0
rs19 GG AG AA GG AG 0 1
rs20 GG AG AA AG AG 1 1
...
我尝试编写了一个函数,然后用apply函数对每一行调用它。
# The asker's attempt (this is the version that raises the error).
# NOTE(review): apply(df, 1, ...) hands the function one row at a time as a
# plain vector, so the two-dimensional index `df[, ...]` below cannot work on
# it; the example data also names the column "risk", not "risk_hom".
VariantDetected <- function(df) {
x <- which(df[5:length(df)] == df[,c("ref","het","risk_hom")])
return(x)
}
# Call the function once per row of the data frame.
apply(df, 1, VariantDetected)
但是运行时会报错,有什么建议吗?谢谢。
答案 0 :(得分:2)
使用dplyr软件包会更容易,因为它能让您的代码更具可读性。如果您不介意前6列保持为字符型,则可以删除函数中的倒数第二行。希望对您有帮助。
#Needed library---------
library(dplyr)
# Your function------------
# Adds Sample1.vd and Sample2.vd to the data: 0 when the sample genotype
# equals `ref`, 1 when it equals `het`, and 2 otherwise.
#
# dataset: data frame with the columns ref, het, Sample1 and Sample2, and at
#          least 6 columns in total.
# Returns a copy of `dataset` whose first 6 columns are factors, with the two
# numeric .vd columns added.
VariantDetected <- function(dataset) {
  # Compare genotypes as plain strings: `==` refuses factors whose level sets
  # differ. lapply() keeps one column per variable even for a one-row data
  # frame, where sapply() would collapse the result to a single vector.
  df1 <- dataset
  df1[] <- lapply(df1, as.character)
  df1 <- df1 %>%
    mutate(
      Sample1.vd = ifelse(Sample1 == ref, 0, ifelse(Sample1 == het, 1, 2)),
      Sample2.vd = ifelse(Sample2 == ref, 0, ifelse(Sample2 == het, 1, 2))
    )
  # Delete the next line if you don't mind the first 6 columns being characters.
  df1[1:6] <- lapply(df1[1:6], as.factor)
  df1
}
# Execute it on your dataset; the result replaces `df` ----
df <- VariantDetected(df)
致谢/ Revanth Nemani
答案 1 :(得分:0)
我认为您在这里需要使用嵌套的ifelse:
# Score every sample column against the per-SNP reference genotypes:
# 0 = matches `ref`, 1 = matches `het`, 2 = anything else.
# Assumes the sample columns start at column 5, as in the example data.
sample_cols <- names(df)[5:ncol(df)]
ref_gt <- as.character(df[["ref"]])
het_gt <- as.character(df[["het"]])
# Work column by column: ifelse() is vectorised over the rows, so there is no
# need for apply() (which coerces the whole data frame to a matrix), and the
# assignment keeps its shape even when there is only one sample column.
df[paste0(sample_cols, ".vd")] <- lapply(df[sample_cols], function(genotype) {
  genotype <- as.character(genotype)
  ifelse(genotype == ref_gt, 0, ifelse(genotype == het_gt, 1, 2))
})
df
# SNP ref het risk Sample1 Sample2 Sample1.vd Sample2.vd
#1 rs1 GG AG AA AG GG 1 0
#2 rs2 AA AG GG AG AA 1 0
#3 rs3 AA AG GG AG AG 1 1
#4 rs4 GG AG AA AG AA 1 2
#5 rs5 GG AG AA AG AA 1 2
#6 rs6 GG AG AA AG AG 1 1
#7 rs7 AA AG GG AA AA 0 0
#8 rs8 CC AC AA AC CC 1 0
#9 rs9 GG AG AA GG GG 0 0
#10 rs10 GG AG AA GG AG 0 1
#....
或者,如果您想把它写成一个函数:
# Scores the sample genotypes of a single row.
#
# x: named vector for one row; elements 5 to the end are the sample
#    genotypes, and the elements named "ref" and "het" hold the genotypes
#    they are compared with.
# Returns one value per sample: 0 if it equals `ref`, 1 if it equals `het`,
# and 2 otherwise.
VariantDetected <- function(x) {
  genotypes <- x[5:length(x)]
  is_ref <- genotypes == x["ref"]
  is_het <- genotypes == x["het"]
  ifelse(is_ref, 0, ifelse(is_het, 1, 2))
}
# Run the helper on every row. apply() returns one column per row, so the
# result is transposed to one row per SNP before it is assigned.
df[paste0("Sample", seq_along(5:ncol(df)), ".vd")]<-t(apply(df, 1, VariantDetected))
数据
# Example data as dput() output.
# NOTE(review): this dump already contains the Sample1.vd / Sample2.vd result
# columns (apparently it was taken after running the code above); use df[1:6]
# to start from the six original input columns.
df <- structure(list(SNP = structure(c(1L, 12L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L), .Label =
c("rs1",
"rs10", "rs11", "rs12", "rs13", "rs14", "rs15", "rs16", "rs17",
"rs18", "rs19", "rs2", "rs20", "rs3", "rs4", "rs5", "rs6", "rs7",
"rs8", "rs9"), class = "factor"), ref = structure(c(3L, 1L, 1L,
3L, 3L, 3L, 1L, 2L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 3L,
3L), .Label = c("AA", "CC", "GG"), class = "factor"), het =
structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L), .Label = c("AC", "AG"), class = "factor"), risk =
structure(c(1L,
3L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 2L, 3L,
1L, 1L, 1L), .Label = c("AA", "CC", "GG"), class = "factor"),
Sample1 = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 4L,
4L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 3L), .Label = c("AA",
"AC", "AG", "GG"), class = "factor"), Sample2 = structure(c(4L,
1L, 2L, 1L, 1L, 2L, 1L, 3L, 4L, 2L, 4L, 2L, 1L, 1L, 1L, 1L,
1L, 4L, 2L, 2L), .Label = c("AA", "AG", "CC", "GG"), class = "factor"),
Sample1.vd = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 2, 1, 1,
2, 0, 0, 0, 0, 1), Sample2.vd = c(0, 0, 1, 2, 2, 1, 0, 0,
0, 1, 2, 1, 2, 0, 2, 0, 0, 0, 1, 1)), row.names = c(NA, -20L
), .Names = c("SNP", "ref", "het", "risk", "Sample1", "Sample2",
"Sample1.vd", "Sample2.vd"), class = "data.frame")