Question

我有一个已经排序的数据框，如下所示：

mydf <- data.frame(ID="A1", Level=c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species"), Taxonomy=c("D__Eukaryota","K__Chloroplastida",NA,"C__Mamiellophyceae",NA,NA,"G__Crustomastix","S__Crustomastix sp. MBIC10709"), Letter=c("D","K","P","C","O","F","G","S"))

  ID   Level                      Taxonomy Letter
1 A1  domain                  D__Eukaryota      D
2 A1 kingdom             K__Chloroplastida      K
3 A1  phylum                          <NA>      P
4 A1   class            C__Mamiellophyceae      C
5 A1   order                          <NA>      O
6 A1  family                          <NA>      F
7 A1   genus               G__Crustomastix      G
8 A1 species S__Crustomastix sp. MBIC10709      S

我想要的是用最后一个非NA值替换NA值，在开头以滚动方式添加所有字母“miss”...看看我的意思如下。

目标是获得如下数据框：

  ID   Level                      Taxonomy Letter
1 A1  domain                  D__Eukaryota      D
2 A1 kingdom             K__Chloroplastida      K
3 A1  phylum          P__K__Chloroplastida      P
4 A1   class            C__Mamiellophyceae      C
5 A1   order         O__C__Mamiellophyceae      O
6 A1  family      F__O__C__Mamiellophyceae      F
7 A1   genus               G__Crustomastix      G
8 A1 species S__Crustomastix sp. MBIC10709      S

注意最后2个NA，最后一个如何携带前一个的值。看看两个中的第一个如何从O__C开始，最后一个用F__O__C开始。

到目前为止，我最好的尝试是以下（感谢Ajay Ohri）：

library(zoo)
mydf <- data.frame(ID="A1", Level=c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species"), Taxonomy=c("D__Eukaryota","K__Chloroplastida",NA,"C__Mamiellophyceae",NA,NA,"G__Crustomastix","S__Crustomastix sp. MBIC10709"), Letter=c("D","K","P","C","O","F","G","S"))
mydf <- data.frame(lapply(mydf, as.character), stringsAsFactors=FALSE)
mydf$Letter2 <- ifelse(is.na(mydf$Taxonomy),paste(mydf$Letter,'__',sep=''),"")
mydf
mydf$Taxonomy <- paste(mydf$Letter2, na.locf(mydf$Taxonomy), sep='')
mydf

注意我仍然没有设法以滚动的方式进行（我得到F__C而不是最后一个NA的F__O__C）。有帮助吗？谢谢！

PS：让我知道它是否仍然令人困惑，所以我连续制作了另外一个具有更多NA的MWE，所以我需要的更明显。

Answer 1

由于OP已经提到内存消耗至关重要，这里采用data.table方法，使用na.locf()包中的zoo函数：

library(data.table)   # CRAN version 1.10.4 used
# coerce to data.table, convert factors to characters
DT <- data.table(mydf)[, lapply(.SD, as.character)]
# set marker for NA rows 
DT[, na := is.na(Taxonomy)][]
# fill NA by Last Observation Carried Forward
DT[, Taxonomy := zoo::na.locf(Taxonomy)][]
# create list of Letters and unique row count within each group of missing taxonomies
DT[(na), `:=`(tmp = .(Letter), rn = seq_len(.N)), by = .(ID, Taxonomy)][]
# replace incomplete taxonomies
DT[(na), Taxonomy := paste(c(rev(unlist(tmp)[1:rn]), Taxonomy), collapse = "__"), 
   by = .(ID, Taxonomy, rn)][]
# clean up
DT[, c("na", "tmp", "rn") := NULL][]

   ID   Level                      Taxonomy Letter
1: A1  domain                  D__Eukaryota      D
2: A1 kingdom             K__Chloroplastida      K
3: A1  phylum          P__K__Chloroplastida      P
4: A1   class            C__Mamiellophyceae      C
5: A1   order         O__C__Mamiellophyceae      O
6: A1  family      F__O__C__Mamiellophyceae      F
7: A1   genus               G__Crustomastix      G
8: A1 species S__Crustomastix sp. MBIC10709      S

我没有链接表达式，所以代码可以一步一步地执行。

请注意data.table正在更新到位而不复制整个数据集，这样可以节省内存和时间。

先决条件和其他解释

作为对this comment的回复，起始数据框已订购且非冗余的OP has confirmed以及 ID + Level应为唯一密钥数据框。

但是，由于上述解决方案取决于这些假设，因此值得添加一些检查：

# (1) ID + Level are unique keys: find duplicate Levels per ID
stopifnot(anyDuplicated(DT, by = c("ID", "Level")) == 0L)
# (2) rows missing: count rows per ID, there should be 8 Levels
DT[, .N, by = ID][, stopifnot(all(N == 8L))]
# (3) order, wrong Level names, and tests (1) and (2) as well
# create data.table with Level in proper order and a sequence number ln
levels <- data.table(
  ln = 1:8,
  Level = c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")
)
# left inner join, i.e., keep only rows with matching Level, keep order of DT
# then check for consecutively ascending level sequence numbers
levels[DT, on = "Level", nomatch = 0][, stopifnot(all(diff(ln) == 1L)), by = ID]

此外，必须确保至少对于顶级Level＆＃34;域＆＃34;指定了Taxonomy。这可以通过以下方式进行双重检查：

# count number of rows with missing Taxonomy on top level "domain"
stopifnot(nrow(DT[Level == "domain" & is.na(Taxonomy)] == 0L))

分组逻辑by = .(ID, Taxonomy)与na上的选择一起使用，即DT[(na), ...，以便将其他字母添加到Taxonomy Taxonomy最初失踪了。在开发解决方案的过程中，我引入了一个额外的辅助列gn := rleid(ID, Taxonomy)，它将覆盖this comment中提到的重复项。最后，我认识到由于先决条件，我可以删除此列。

Answer 2

执行此操作的一种方法是将Reduce与accumulate = TRUE参数一起使用。即。

ind <- is.na(mydf$Taxonomy)
mydf$Taxonomy <- zoo::na.locf(mydf$Taxonomy)
mydf$Taxonomy[ind] <- paste0(with(mydf, ave(Level, Taxonomy, FUN = function(i) 
     Reduce(paste, toupper(substr(rev(i), 1, 1)), accumulate = TRUE)))[ind], '_', 
                                                      sub('.*_', '', mydf$Taxonomy[ind]))

mydf$Taxonomy <- gsub(' ', '_', mydf$Taxonomy)


mydf
#  ID   Level                      Taxonomy Letter
#1 A1  domain                  D__Eukaryota      D
#2 A1 kingdom             K__Chloroplastida      K
#3 A1  phylum            P_K_Chloroplastida      P
#4 A1   class            C__Mamiellophyceae      C
#5 A1   order           F_O_Mamiellophyceae      O
#6 A1  family         F_O_C_Mamiellophyceae      F
#7 A1   genus               G__Crustomastix      G
#8 A1 species S__Crustomastix_sp._MBIC10709      S

Answer 3

第1步

我首先要创建一个带有ifelse的列

data$colnew=ifelse(is.na(data$Taxonomy),"missed","")

如果您不打算粘贴错过的单词，可以跳过此步骤

第2步取最后一个值

来自Replacing NAs with latest non-NA value（请参阅此处的其他方法）

使用zoo包中的na.locf（）函数进行最后一次观察以替换你的NA值

或 新功能

repeat_last = function(x, forward = TRUE, maxgap = Inf, na.rm = FALSE) {
    if (!forward) x = rev(x)           # reverse x twice if carrying backward
    ind = which(!is.na(x))             # get positions of nonmissing values
    if (is.na(x[1]) && !na.rm)         # if it begins with NA
        ind = c(1,ind)                 # add first pos
    rep_times = diff(                  # diffing the indices + length yields how often
        c(ind, length(x) + 1) )          # they need to be repeated
    if (maxgap < Inf) {
        exceed = rep_times - 1 > maxgap  # exceeding maxgap
        if (any(exceed)) {               # any exceed?
            ind = sort(c(ind[exceed] + 1, ind))      # add NA in gaps
            rep_times = diff(c(ind, length(x) + 1) ) # diff again
        }
    }
    x = rep(x[ind], times = rep_times) # repeat the values at these indices
    if (!forward) x = rev(x)           # second reversion
    x
}

也可以在formr包中使用（仅限Github）。 https://github.com/rubenarslan/formr

步骤3

使用粘贴

将两列（newone）与df $ Letter连接成第三列

Answer 4

由于您提到了内存和性能问题，因此您已切换到已接受的data.table解决方案。

我正在添加另一个data.table变体，它不依赖于zoo等其他包，如果 Taxonomy 列不包含太长的{{}}序列，则可能足够快{1}}因为最长的序列决定了while循环的重复次数（例如，在示例数据的情况下为两个代表）：

NA

不幸的是library(data.table) mydf <- data.frame(ID="A1", Level=c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species"), Taxonomy=c("D__Eukaryota","K__Chloroplastida",NA,"C__Mamiellophyceae",NA,NA,"G__Crustomastix","S__Crustomastix sp. MBIC10709"), Letter=c("D","K","P","C","O","F","G","S")) setDT(mydf) # Fill NA value in "Taxonomy" with the value of the prev. row until no NAs occur anymore prev.number.NAs <- 0 # required to stop the loop if no more NA values can be carried forward repeat { number.NAs <- sum(is.na(mydf$Taxonomy)) if( number.NAs == 0 | number.NAs == prev.number.NAs) break; mydf[, filler := shift(Taxonomy), by = .(ID)] # fill temporary working column with the value of the prev. row of the same group mydf[!is.na(filler) & is.na(Taxonomy), Taxonomy := paste0(Letter, "__", filler)] prev.number.NAs <- number.NAs } mydf[, filler := NULL] # remove working column mydf的{{1}}函数没有提供“最后一个观察结转”参数，所以我不得不使用shift循环。

更新1：正如@UweBlock在下面的评论中提到的，我已将data.table循环替换为while循环，以避免在{{while循环时出现无限循环1}}第一行中 Taxonomy 列中的值。 THX找到了这个！

更新2：向前推进最后一次观察现在只在同一组数据中完成（由 ID 列定义 - 作为评论中指示的OP）。感谢@UweBlock指出这个问题！

Answer 5

一种以NA开头的NA值填充的方法，并且还简化了处理组的逻辑：

forward_fill <- function (x) {
  if (length(x) == 0) return (vector(mode(x), 0))      

  xt  <- tail(x, -1)  
  x0  <- c(x[1], xt[!is.na(xt)])
  id0 <- c(TRUE,    !is.na(xt))
  y   <- x0[cumsum(id0)]
  
  return (y)
}

使用滚动方式填充具有最后非NA值的向量中的NA值加上另一个向量中的值

5 个答案:

先决条件和其他解释