以递归方式引用数据框(data frame)

时间:2012-11-28 22:16:50

标签: r subset

有没有办法让数据框引用自身?

我发现自己花了很多时间编写像 y$Category1[is.na(y$Category1)] <- NULL 这样的代码。这类代码难以阅读,而且写起来就像是在缓慢地重复打字。我想知道是否有类似下面这样的写法:

y$Category1[is.na(self)] <- NULL 这样的写法,可以让我改用。

谢谢

1 个答案:

答案 0 :(得分:7)

这是一个很棒的问题。不幸的是,正如 @user295691 在评论中指出的那样,问题在于需要引用向量两次:一次作为被索引的对象,一次作为条件判断的对象。这种双重引用似乎无法避免。

numericVector[cond(numericVector)] <- newVal

我认为我们能做的是写一个简洁漂亮的函数,这样就不必写成下面这样:

 # this  
 y$Category1[is.na(y$Category1)] <- list(NULL)

 # we can have this: 
 NAtoNULL(y$Category1)

例如,下列函数都是对 selfAssign()(定义见下文)的封装:

NAtoNULL(obj)      # Replaces NA values in obj with NULL.
NAtoVal(obj, val)  # Replaces NA values in obj with val.
selfReplace(obj, toReplace, val)  # Replaces toReplace values in obj with val

# and selfAssign can be called directly, but I'm not sure there would be a good reason to
selfAssign(obj, ind, val)  # equivalent to obj[ind] <- val

示例:

# Sample data frame: a character, an integer and a numeric column, each
# containing some NA values to experiment with.
df <- data.frame(
  subj = c("A", NA, "C", "D", "E", NA, "G"),
  temp = c(111L, 112L, NA, 114L, 115L, 116L, NA),
  size = c(0.7133, NA, 0.7457, NA, 0.0487, NA, 0.8481),
  stringsAsFactors = FALSE
)

df
  subj temp   size
1    A  111 0.7133
2 <NA>  112     NA
3    C   NA 0.7457
4    D  114     NA
5    E  115 0.0487
6 <NA>  116     NA
7    G   NA 0.8481

# Make some replacements (each call modifies df in place)
NAtoNULL(df$size)    # Replace all NAs in df$size with NULLs
NAtoVal(df$temp, 0)  # Replace all NAs in df$temp with 0s
NAtoVal(df$subj, c("B", "E"))   # Replace all NAs in df$subj with alternating "B" and "E"

# the modified df is now:  
df

  subj temp   size
1    A  111 0.7133
2    B  112   NULL
3    C    0 0.7457
4    D  114   NULL
5    E  115 0.0487
6    E  116   NULL
7    G    0 0.8481


# Replace the 0s in temp with NA
selfReplace(df$temp, 0, NA)

# Replace the NULLs in size with 1s
selfReplace(df$size, NULL, 1)

# Replace all "E"s in subj with alternating "E" and "F"
selfReplace(df$subj, c("E"), c("E", "F"))

df

  subj temp   size
1    A  111 0.7133
2    B  112      1
3    C   NA 0.7457
4    D  114      1
5    E  115 0.0487
6    F  116      1
7    G   NA 0.8481

目前这种做法对向量有效,但与 *apply 系列函数一起使用时会失败。我很希望能让它完全可用,尤其是能配合 plyr 使用。关键在于修改 n 的取值以及对 match.call() 的处理(详见下文)。

<hr/>

函数

函数代码如下。

重要提示:目前(还!)不能与 *apply / plyr 一起使用。
我相信可以通过修改 n 的值并调整 match.call() 中的 sys.parent(.) 来实现,但仍然需要一些调试。非常欢迎任何建议或修改。

selfAssign <- function(self, ind, val, n = 1, silent = FALSE) {
  ## Assigns val to self[[ind]] in the environment parent.frame(n), i.e. the
  ## replacement is performed on the caller's own variable.
  ##
  ## Args:
  ##   self:   the object to modify, written the way it is referenced in the
  ##           target environment (e.g. x or df$name).  Wrapper functions may
  ##           instead pass an expression they have already captured
  ##           (e.g. match.call()[[2]]).  Should be a vector; matrices and
  ##           data frames are not supported.
  ##   ind:    logical mask or indices of the elements to replace.
  ##   val:    replacement value(s); positions and values are recycled against
  ##           each other.  NULL is stored as list(NULL).
  ##   n:      how many frames to go up to find the environment in which the
  ##           assignment is evaluated (1 = the caller of selfAssign).
  ##   silent: if TRUE, do not warn when no element is selected.
  ##
  ## Returns:
  ##   NULL when nothing is selected; otherwise, invisibly, the value of the
  ##   last assignment that was evaluated.

  ## LOCATE THE TARGET
  #--------------------------------------
      # environment in which the assignments take place
      target_env <- parent.frame(n)

      # Expression naming the object inside target_env.  A direct call gives
      # us the object itself, so the expression is whatever the caller typed
      # for `self`.  Wrappers hand over an expression they captured with
      # match.call(); in that case the captured expression is the target.
      target <- substitute(self)
      if (is.symbol(self) || is.call(self)) {
        target <- self
      }


  ## CLEAN UP ARGUMENT VALUES
  #--------------------------------------
      # replace logical indices with numeric indices
      if (is.logical(ind)) {
        ind <- which(ind)
      }

      # if no indices will be selected, stop here
      if (length(ind) == 0L) {
        if (!silent) warning("No indecies selected")
        return(NULL)
      }

      # val cannot directly be NULL, it must be assigned as list(NULL)
      values <- if (is.null(val)) list(quote(list(NULL))) else as.list(val)
      if (length(values) == 0L) {
        stop("'val' must contain at least one replacement value")
      }


  ## BUILD THE ASSIGNMENTS AND EVAL THEM
  #--------------------------------------
      # Recycle positions and values against each other, then evaluate one
      # `target[[i]] <- value` call per pair.  The calls are constructed
      # directly rather than parsed from text, so strings need no quoting or
      # escaping and numbers keep their exact value and type.
      n_assign <- max(length(ind), length(values))
      ind <- rep_len(ind, n_assign)
      values <- rep_len(values, n_assign)

      result <- NULL
      for (k in seq_len(n_assign)) {
        assignment <- call("<-", call("[[", target, ind[[k]]), values[[k]])
        result <- eval(assignment, envir = target_env)
      }

      invisible(result)
}


NAtoNULL <- function(obj, n = 1) {
  ## Replaces the NA values of obj with NULL, modifying obj in the calling
  ## environment.  n is the number of frames between this function and the
  ## environment that holds obj.

  # the expression the caller wrote for obj (e.g. df$size)
  target <- match.call()[[2]]

  # positions holding a missing value
  missing_mask <- is.na(obj)

  # one frame further up, because selfAssign() is called from in here
  selfAssign(target, missing_mask, NULL, n = n + 1)
}

NAtoVal <- function(obj, val, n = 1) {
  ## Replaces the NA values of obj with val, modifying obj in the calling
  ## environment.  n is the number of frames between this function and the
  ## environment that holds obj.

  # the expression the caller wrote for obj (e.g. df$temp)
  target <- match.call()[[2]]

  # positions holding a missing value
  missing_mask <- is.na(obj)

  # one frame further up, because selfAssign() is called from in here
  selfAssign(target, missing_mask, val, n = n + 1)
}

selfReplace <- function(obj, toReplace, val, n = 1) {
  ## Replaces occurrences of toReplace within obj with val, modifying obj in
  ## the calling environment by way of selfAssign().
  ##
  ## Args:
  ##   obj:       the vector to modify, written as the caller refers to it
  ##              (e.g. x or df$name).
  ##   toReplace: value(s) to look for; may be NULL, NA, a single value or
  ##              several values.
  ##   val:       replacement value(s), passed on to selfAssign().
  ##   n:         number of frames between this function and the environment
  ##              that holds obj.

  # determine ind based on value & length of toReplace
  # TODO:  this will not work properly for data frames, but neither will selfAssign, yet.
  if (is.null(toReplace)) {
    # vapply always yields a logical vector, even when obj is empty
    ind <- vapply(obj, function(x) is.null(x[[1]]), logical(1))
  } else if (length(toReplace) == 1L && is.na(toReplace)) {
    # only a single NA takes this branch; a multi-valued toReplace must not
    # be used as an `if` condition (that is an error in R >= 4.2)
    ind <- is.na(obj)
  } else if (length(obj) > 1 || length(toReplace) != 1L) {
    # note, this wont work for data frames
    ind <- obj %in% toReplace
  } else {
    # single element compared against a single value
    ind <- obj == toReplace
  }

  # one frame further up, because selfAssign() is called from in here
  selfAssign(match.call()[[2]], ind, val, n = n + 1)
}



  ## Unfinished sketch: this should go inside NAtoNULL, NAtoVal etc.
  ## NOTE(review): `x1` is not defined in the visible code; presumably it
  ## stands for the wrapper's captured call -- TODO confirm before use.

  # TODO: modify for use with *apply
  if(substr(paste(as.expression(x1)), 1, 10) == "FUN(obj = ") {
      # PASS (nothing is implemented here yet).  The condition is meant to
      #  identify when the call is coming from *apply.
      #  In such a case, n needs to be increased by 1 for apply & lapply,
      #  and by 2 for sapply.
      # The increase required for plyr functions is not known yet.
  }