我有大量数据,如下所示:
Name SNP.x ILMN.Strand.x Customer.Strand.x SNP.y ILMN.Strand.y Customer.Strand.y
exm-rs10128711 [T/C] BOT BOT [T/C] BOT BOT
exm-rs10134944 [A/G] TOP BOT NA NA NA
exm-rs10218696 NA NA NA [T/C] BOT TOP
exm-rs10223421 [A/C] TOP BOT NA NA NA
如何创建新列“SNP”,“ILMN.Strand”,“Customer.Strand”,其中:
if(SNP.x不等于SNP.y),且SNP.x为NA(缺失值),则新列中的值应取自“SNP.y”,“ILMN .Strand.y”, “Customer.Strand.y”
if(SNP.x不等于SNP.y),且SNP.y为NA(缺失值),则新列中的值应取自“SNP.x”,“ILMN” .Strand.x”, “Customer.Strand.x”
非常感谢提前! :)
答案 0 :(得分:0)
我假设,如果SNP.x
和SNP.y
都是NA
,则会从数据框中删除该行。如果SNP.x != SNP.y
行也被删除(如果发生这种情况)。
下面的代码不是很好或非常有效,但它应该可以解决问题。
tmp <- apply(df, 1, function(x){
# if SNP.x == SNP.y and not NA pass X
if(!is.na(x["SNP.x"] == x["SNP.y"])) {
if(x["SNP.x"] == x["SNP.y"]) data.frame(Name = x["Name"], SNP = x["SNP.x"], ILMN.Strand = x["ILMN.Strand.x"], Customer.Strand = x["Customer.Strand.x"])
} else if(is.na(x["SNP.x"])) { # else if SNP.x is NA pass y
if(!is.na(x["SNP.y"])) data.frame(Name = x["Name"], SNP = x["SNP.y"], ILMN.Strand = x["ILMN.Strand.y"], Customer.Strand = x["Customer.Strand.y"])
} else if(is.na(x["SNP.y"])) { # else if SNP.y is NA pass x
if(!is.na(x["SNP.x"])) data.frame(Name = x["Name"], SNP = x["SNP.x"], ILMN.Strand = x["ILMN.Strand.x"], Customer.Strand = x["Customer.Strand.x"])
} else NULL # otherwise pass NULL (e.g. (SNP.x != SNP.y AND neither are NA))
})
# rbind the list-output of the previous apply() function
result <- do.call(rbind, tmp[!sapply(tmp, is.null)])
结果是具有以下结构的数据框:
str(result)
'data.frame': 81 obs. of 4 variables:
$ Name : Factor w/ 81 levels "exm-rs666","exm-rs3510",..: 1 2 3 4 5 6 7 8 9 10 ...
$ SNP : Factor w/ 2 levels "[A/C]","[T/G]": 1 2 1 1 1 2 2 2 2 2 ...
$ ILMN.Strand : Factor w/ 2 levels "TOP","BOT": 1 1 1 2 1 1 2 2 1 1 ...
$ Customer.Strand: Factor w/ 2 levels "TOP","BOT": 1 1 1 2 1 2 1 1 1 1 ...
编辑:
这可能是更好的解决方案(R 3.2.4与dplyr 0.5.0),因为apply()
将数据帧强制转换为矩阵等。下面的解决方案也会返回唯一的“..”,如果{{ 1}}并且两者都不是(SNP.X != SNP.Y)
。希望这可以解决问题,尽管如果没有关于您的数据的更多信息,很难预测您可能遇到的问题。
在这个解决方案中,因素被强制转换为字符,因此请记住这一点。
NA
答案 1 :(得分:0)
data.table
怎么样?
我不确定逻辑应该如何工作(SNP.x != SNP.y
或NA's
两种情况,但您可以自行修改。
编辑:很少有方法进行基准测试。
require(data.table)
require(microbenchmark)
dat1 <- data.table(Name = c("exm-rs10128711", "exm-rs10134944", "exm-rs10218696", "exm-rs10223421", "both_NAs", "no_NAs_just_diff"),
SNP.x = c("[T/C]", "[A/G]", NA, "[A/C]", NA, "new_x"),
ILMN.Strand.x = c("BOT", "TOP", NA, "TOP", "new_x", "new_x"),
Customer.Strand.x = c("BOT", "BOT", NA, "BOT", "new_x", "new_x"),
SNP.y = c("[T/C]", NA, "[T/C]", NA, NA, "new_y"),
ILMN.Strand.y = c("BOT", NA, "BOT", NA, "new_y", "new_y"),
Customer.Strand.y = c("BOT", NA, "TOP", NA, "new_y", "new_y"))
# Make it a bit bigger
for (i in seq_len(15)) dat1 <- rbind(dat1, dat1) # 15 MB, 196608 rows
# If needed cast to characters (to get rid of "level sets of factors are different" error...)
# dat <- dat[, lapply(.SD, as.character)]
# if else returning a list
f1 <- function() {
dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") :=
if ( !is.na(SNP.x) ) { list(SNP.x, ILMN.Strand.x, Customer.Strand.x)
} else if ( !is.na(SNP.y) ) { list(SNP.y, ILMN.Strand.y, Customer.Strand.y)
} else { list(NA_character_, NA_character_, NA_character_) },
by = seq_len(nrow(dat1))
][]
}
# ifelse per column
f2 <- function() {
dat1[, ":="(SNP = ifelse(!is.na(SNP.x), SNP.x,
ifelse(!is.na(SNP.y), SNP.y, NA_character_)),
ILMN.Strand = ifelse(!is.na(SNP.x), ILMN.Strand.x,
ifelse(!is.na(SNP.y), ILMN.Strand.y, NA_character_)),
Customer.Strand = ifelse(!is.na(SNP.x), Customer.Strand.x,
ifelse(!is.na(SNP.y), Customer.Strand.y, NA_character_)))
][]
}
# ifelse returning a list
f3 <- function() {
dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") :=
ifelse (!is.na(SNP.x), list(list(SNP.x, ILMN.Strand.x, Customer.Strand.x)),
ifelse (!is.na(SNP.y), list(list(SNP.y, ILMN.Strand.y, Customer.Strand.y)),
list(list(NA_character_, NA_character_, NA_character_))))[[1]] # HERE IS THE ONE!
][]
}
microbenchmark(
d1 <- f1(),
d2 <- f2(),
d3 <- f3(),
times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 303.03681 316.91054 354.9147 330.91177 403.3858 420.3286 5 b
# d2 <- f2() 658.27527 660.19131 723.9005 664.31352 737.0994 899.6230 5 c
# d3 <- f3() 78.20754 84.91487 110.3533 86.73539 104.9149 196.9938 5 a
d1[1:6, ]
# Name SNP.x ILMN.Strand.x Customer.Strand.x SNP.y ILMN.Strand.y Customer.Strand.y SNP ILMN.Strand Customer.Strand
# 1: exm-rs10128711 [T/C] BOT BOT [T/C] BOT BOT [T/C] BOT BOT
# 2: exm-rs10134944 [A/G] TOP BOT NA NA NA [A/G] TOP BOT
# 3: exm-rs10218696 NA NA NA [T/C] BOT TOP [T/C] BOT TOP
# 4: exm-rs10223421 [A/C] TOP BOT NA NA NA [A/C] TOP BOT
# 5: both_NAs NA new_x new_x NA new_y new_y NA NA NA
# 6: no_NAs_just_diff new_x new_x new_x new_y new_y new_y new_x new_x new_x
sapply(list(d1, d2, d3), FUN = identical, d1)
# [1] TRUE TRUE TRUE
f2
只是因为我无法弄清楚如何从list
返回ifelse
,只是幸运的是我在{list
中使用了f3
这个想法{1}}。
要了解data.table
中的多个分配,请参阅例如: Assign multiple columns using := in data.table, by group
96行:
# Unit: microseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 1964.988 1968.936 2238.697 2273.276 2404.722 2581.564 5 b
# d2 <- f2() 976.574 998.284 1147.020 1033.021 1038.942 1688.280 5 a
# d3 <- f3() 684.471 845.916 1026.389 1141.573 1209.466 1250.519 5 a
6144行:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# d1 <- f1() 11.977032 12.128610 13.869310 12.52532 12.585317 20.130271 5 b
# d2 <- f2() 17.200552 17.627260 21.616209 20.76224 22.830254 29.660738 5 c
# d3 <- f3() 2.945114 3.009456 3.317191 3.04064 3.071429 4.519314 5 a