Question

我有大量数据，如下所示：

Name    SNP.x   ILMN.Strand.x   Customer.Strand.x   SNP.y   ILMN.Strand.y   Customer.Strand.y   
exm-rs10128711  [T/C]   BOT BOT [T/C]   BOT BOT
exm-rs10134944  [A/G]   TOP BOT NA  NA  NA  
exm-rs10218696    NA    NA  NA [T/C] BOT TOP
exm-rs10223421  [A/C]   TOP BOT NA  NA  NA

如何创建新列“SNP”，“ILMN.Strand”，“Customer.Strand”，其中：

if（SNP.x = SNP.y），则“SNP”，“ILMN.Strand”，“Customer.Strand”将来自“SNP.x”，“ILMN.Strand.x”，“Customer。 Strand.x“
if（SNP.x不等于SNP.y），且SNP.x为NA（缺失值），则新列中的值应取自“SNP.y”，“ILMN .Strand.y”， “Customer.Strand.y”
if（SNP.x不等于SNP.y），且SNP.y为NA（缺失值），则新列中的值应取自“SNP.x”，“ILMN” .Strand.x”， “Customer.Strand.x”

非常感谢提前！：）

Answer 1

我假设，如果SNP.x和SNP.y都是NA，则会从数据框中删除该行。如果SNP.x != SNP.y行也被删除（如果发生这种情况）。

下面的代码不是很好或非常有效，但它应该可以解决问题。

tmp <- apply(df, 1, function(x){
  # if SNP.x == SNP.y and not NA pass X
  if(!is.na(x["SNP.x"] == x["SNP.y"])) {
    if(x["SNP.x"] == x["SNP.y"]) data.frame(Name = x["Name"], SNP = x["SNP.x"],  ILMN.Strand = x["ILMN.Strand.x"], Customer.Strand = x["Customer.Strand.x"])
  } else if(is.na(x["SNP.x"])) { # else if SNP.x is NA pass y
    if(!is.na(x["SNP.y"])) data.frame(Name = x["Name"], SNP = x["SNP.y"],  ILMN.Strand = x["ILMN.Strand.y"], Customer.Strand = x["Customer.Strand.y"])
  } else if(is.na(x["SNP.y"])) { # else if SNP.y is NA pass x
    if(!is.na(x["SNP.x"])) data.frame(Name = x["Name"], SNP = x["SNP.x"],  ILMN.Strand = x["ILMN.Strand.x"], Customer.Strand = x["Customer.Strand.x"])
  } else NULL # otherwise pass NULL (e.g. (SNP.x != SNP.y AND neither are NA))
})

# rbind the list-output of the previous apply() function
result <- do.call(rbind, tmp[!sapply(tmp, is.null)])

结果是具有以下结构的数据框：

str(result)

'data.frame':   81 obs. of  4 variables:
 $ Name           : Factor w/ 81 levels "exm-rs666","exm-rs3510",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ SNP            : Factor w/ 2 levels "[A/C]","[T/G]": 1 2 1 1 1 2 2 2 2 2 ...
 $ ILMN.Strand    : Factor w/ 2 levels "TOP","BOT": 1 1 1 2 1 1 2 2 1 1 ...
 $ Customer.Strand: Factor w/ 2 levels "TOP","BOT": 1 1 1 2 1 2 1 1 1 1 ...

编辑：

这可能是更好的解决方案（R 3.2.4与dplyr 0.5.0），因为apply()将数据帧强制转换为矩阵等。下面的解决方案也会返回唯一的“..”，如果{{ 1}}并且两者都不是(SNP.X != SNP.Y)。希望这可以解决问题，尽管如果没有关于您的数据的更多信息，很难预测您可能遇到的问题。在这个解决方案中，因素被强制转换为字符，因此请记住这一点。

NA

Answer 2

data.table怎么样？

我不确定逻辑应该如何工作（SNP.x != SNP.y或NA's两种情况，但您可以自行修改。

编辑：很少有方法进行基准测试。

准备数据：

require(data.table)
require(microbenchmark)

dat1 <- data.table(Name = c("exm-rs10128711", "exm-rs10134944", "exm-rs10218696", "exm-rs10223421", "both_NAs", "no_NAs_just_diff"),
                   SNP.x = c("[T/C]", "[A/G]", NA, "[A/C]", NA, "new_x"),
                   ILMN.Strand.x = c("BOT", "TOP", NA, "TOP", "new_x", "new_x"),
                   Customer.Strand.x = c("BOT", "BOT", NA, "BOT", "new_x", "new_x"),
                   SNP.y = c("[T/C]", NA, "[T/C]", NA, NA, "new_y"),
                   ILMN.Strand.y = c("BOT", NA, "BOT", NA, "new_y", "new_y"),
                   Customer.Strand.y = c("BOT", NA, "TOP", NA, "new_y", "new_y"))

# Make it a bit bigger
for (i in seq_len(15)) dat1 <- rbind(dat1, dat1)  # 15 MB, 196608 rows

# If needed cast to characters (to get rid of "level sets of factors are different" error...)
# dat <- dat[, lapply(.SD, as.character)]

功能：

# if else returning a list
f1 <- function() {
  dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") :=
         if        ( !is.na(SNP.x) ) { list(SNP.x, ILMN.Strand.x, Customer.Strand.x)
         } else if ( !is.na(SNP.y) ) { list(SNP.y, ILMN.Strand.y, Customer.Strand.y)
         } else                      { list(NA_character_, NA_character_, NA_character_) },
       by = seq_len(nrow(dat1))
       ][]
}

# ifelse per column
f2 <- function() {
  dat1[, ":="(SNP = ifelse(!is.na(SNP.x), SNP.x,
                           ifelse(!is.na(SNP.y), SNP.y, NA_character_)),
              ILMN.Strand = ifelse(!is.na(SNP.x), ILMN.Strand.x,
                                   ifelse(!is.na(SNP.y), ILMN.Strand.y, NA_character_)),
              Customer.Strand = ifelse(!is.na(SNP.x), Customer.Strand.x,
                                       ifelse(!is.na(SNP.y), Customer.Strand.y, NA_character_)))
       ][]
}

# ifelse returning a list
f3 <- function() {
  dat1[, c("SNP", "ILMN.Strand", "Customer.Strand") :=
         ifelse (!is.na(SNP.x), list(list(SNP.x, ILMN.Strand.x, Customer.Strand.x)),
                 ifelse (!is.na(SNP.y), list(list(SNP.y, ILMN.Strand.y, Customer.Strand.y)),
                                        list(list(NA_character_, NA_character_, NA_character_))))[[1]]  # HERE IS THE ONE!
       ][]
}

基准

microbenchmark(
  d1 <- f1(),
  d2 <- f2(),
  d3 <- f3(),
  times = 5)

# Unit: milliseconds
#        expr       min        lq     mean    median       uq      max neval cld
#  d1 <- f1() 303.03681 316.91054 354.9147 330.91177 403.3858 420.3286     5  b 
#  d2 <- f2() 658.27527 660.19131 723.9005 664.31352 737.0994 899.6230     5   c
#  d3 <- f3()  78.20754  84.91487 110.3533  86.73539 104.9149 196.9938     5 a  

d1[1:6, ]
#                Name SNP.x ILMN.Strand.x Customer.Strand.x SNP.y ILMN.Strand.y Customer.Strand.y   SNP ILMN.Strand Customer.Strand
# 1:   exm-rs10128711 [T/C]           BOT               BOT [T/C]           BOT               BOT [T/C]         BOT             BOT
# 2:   exm-rs10134944 [A/G]           TOP               BOT    NA            NA                NA [A/G]         TOP             BOT
# 3:   exm-rs10218696    NA            NA                NA [T/C]           BOT               TOP [T/C]         BOT             TOP
# 4:   exm-rs10223421 [A/C]           TOP               BOT    NA            NA                NA [A/C]         TOP             BOT
# 5:         both_NAs    NA         new_x             new_x    NA         new_y             new_y    NA          NA              NA
# 6: no_NAs_just_diff new_x         new_x             new_x new_y         new_y             new_y new_x       new_x           new_x

sapply(list(d1, d2, d3), FUN = identical, d1)
# [1] TRUE TRUE TRUE

小集

96行：

# Unit: microseconds
#        expr      min       lq     mean   median       uq      max neval cld
#  d1 <- f1() 1964.988 1968.936 2238.697 2273.276 2404.722 2581.564     5   b
#  d2 <- f2()  976.574  998.284 1147.020 1033.021 1038.942 1688.280     5  a 
#  d3 <- f3()  684.471  845.916 1026.389 1141.573 1209.466 1250.519     5  a

6144行：

# Unit: milliseconds
#        expr       min        lq      mean   median        uq       max neval cld
#  d1 <- f1() 11.977032 12.128610 13.869310 12.52532 12.585317 20.130271     5  b 
#  d2 <- f2() 17.200552 17.627260 21.616209 20.76224 22.830254 29.660738     5   c
#  d3 <- f3()  2.945114  3.009456  3.317191  3.04064  3.071429  4.519314     5 a

如何创建一个满足R中不同“IF”条件的新列？

2 个答案:

准备数据：

功能：

基准

评论

小集