在另一个数据表中基于NA替换一个数据表中的值

时间:2016-07-07 12:48:54

标签: r lapply

我正在尝试比较几个重叠的 SNP/个体的基因分型数据。正如您在下面的数据结构中所看到的,e[1,2] 和 e[2,3] 都是 NA。现在我想把 d[1,2](当前值为 1)和 d[2,3](当前值为 1)也替换为 NA。

# Genotype table `d`: 3 rows x 19 integer columns, no missing values.
# check.names = FALSE keeps the purely numeric column names untouched.
d <- data.frame(
  `100099681` = c(0L, 2L, 0L),
  `101666591` = c(1L, 1L, 0L),
  `102247652` = c(1L, 1L, 1L),
  `102284616` = c(0L, 1L, 0L),
  `103582612` = c(0L, 1L, 1L),
  `104344528` = c(2L, 1L, 0L),
  `105729734` = c(1L, 0L, 1L),
  `109897137` = c(0L, 0L, 2L),
  `112768301` = c(0L, 1L, 1L),
  `114724443` = c(1L, 1L, 1L),
  `114826164` = c(1L, 0L, 1L),
  `115358770` = c(0L, 2L, 0L),
  `115399788` = c(1L, 1L, 0L),
  `118669033` = c(0L, 1L, 1L),
  `118875482` = c(2L, 1L, 0L),
  `119366362` = c(0L, 2L, 0L),
  `119627971` = c(0L, 1L, 1L),
  `120295351` = c(0L, 2L, 0L),
  `120998030` = c(0L, 0L, 2L),
  row.names = c("7:100038150_C", "7:100079759_T", "7:100256942_A"),
  check.names = FALSE
)
> d
#             100099681 101666591 102247652 102284616 103582612 104344528 105729734 109897137 112768301 114724443 114826164 115358770 115399788 118669033 118875482 119366362 119627971 120295351 120998030
#7:100038150_C         0         1         1         0         0         2         1         0         0         1         1         0         1         0         2         0         0         0        0
#7:100079759_T         2         1         1         1         1         1         0         0         1         1         0         2         1         1         1         2         1         2        0
#7:100256942_A         0         0         1         0         1         0         1         2         1         1         1         0         0         1         0         0         1         0        2

# Genotype table `e`: same 3 x 19 layout as `d`, but with missing calls
# (NA) at [1, 2] and [2, 3].
# check.names = FALSE keeps the purely numeric column names untouched.
e <- data.frame(
  `100099681` = c(1L, 1L, 0L),
  `101666591` = c(NA, 1L, 1L),
  `102247652` = c(0L, NA, 0L),
  `102284616` = c(1L, 1L, 0L),
  `103582612` = c(1L, 0L, 1L),
  `104344528` = c(1L, 0L, 1L),
  `105729734` = c(0L, 0L, 1L),
  `109897137` = c(1L, 1L, 0L),
  `112768301` = c(0L, 1L, 1L),
  `114724443` = c(0L, 2L, 0L),
  `114826164` = c(0L, 0L, 2L),
  `115358770` = c(0L, 0L, 2L),
  `115399788` = c(0L, 2L, 0L),
  `118669033` = c(0L, 0L, 2L),
  `118875482` = c(0L, 1L, 1L),
  `119366362` = c(2L, 1L, 0L),
  `119627971` = c(0L, 1L, 1L),
  `120295351` = c(0L, 2L, 0L),
  `120998030` = c(0L, 2L, 1L),
  row.names = c("7:100038150_C", "7:100079759_T", "7:100256942_A"),
  check.names = FALSE
)
> e
#             100099681 101666591 102247652 102284616 103582612 104344528 105729734 109897137 112768301 114724443 114826164 115358770 115399788 118669033 118875482 119366362 119627971 120295351   120998030
#7:100038150_C         1        NA         0         1         1         1         0         1         0         0         0         0         0         0         0         2         0         0          0
#7:100079759_T         1         1        NA         1         0         0         0         1         1         2         0         0         2         0         1         1         1         2          2
#7:100256942_A         0         1         0         0         1         1         1         0         1         0         2         2         0         2         1         0         1         0          1

因此我的预期输出将是

> expected_d
#             100099681 101666591 102247652 102284616 103582612 104344528 105729734 109897137 112768301 114724443 114826164 115358770 115399788 118669033 118875482 119366362 119627971 120295351 120998030
#7:100038150_C         0         NA         1         0         0         2         1         0         0         1         1         0         1         0         2         0         0         0        0
#7:100079759_T         2         1         NA         1         1         1         0         0         1         1         0         2         1         1         1         2         1         2        0
#7:100256942_A         0         0          1         0         1         0         1         2         1         1         1         0         0         1         0         0         1         0        2

我已经走到了这一步;

# Locate every NA in `e` as a two-column (row, col) index matrix.
g <- which(is.na(e), arr.ind = TRUE)
> g
#              row col
#7:100038150_C   1   2
#7:100079759_T   2   3

然后尝试使用 apply 函数把这些位置替换为 "TEST"(或者 NA)

# Attempt: for each (row, col) pair in `g`, overwrite that cell with "TEST".
# NOTE: the `<-` inside the anonymous function modifies a copy of `e` that is
# local to the function, so the global `e` is never changed. apply() only
# collects the value of each assignment, i.e. "TEST" once per row of `g`.
apply(g, 1, function(x){   
    e[x[1], x[2]] <- "TEST" }
)
#>  apply(g, 1, function(x){   e[x[1], x[2]] <- "TEST" })
#7:100038150_C 7:100079759_T
#       "TEST"        "TEST"    

我将在几百万行/列上运行这段代码,因此速度将是一个问题。 提前谢谢你:)

2 个答案:

答案 0 :(得分:4)

我们可以尝试

 # NA^TRUE is NA and NA^FALSE is 1, so multiplying this mask into `d` blanks
 # exactly the cells where `e` is NA and leaves every other value as it was.
 # The result is returned, not assigned back to `d`.
 NA^is.na(e) * d

如果内存是个问题

# Column-wise variant of the NA^mask trick: each column of `d` is paired with
# the matching column of `e`; `d[] <-` writes the results back in place while
# keeping the data.frame's names and row names.
d[] <- Map(
  function(d_col, e_col) NA^is.na(e_col) * d_col,
  d,
  e
)

答案 1 :(得分:2)

基于您的方法的另一种方式,

# Index `d` with the (row, col) matrix of NA positions found in `e` and set
# those cells to NA. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned, which would silently break `arr.ind`.
d[which(is.na(e), arr.ind = TRUE)] <- NA