使用ifelse将NA映射到未知

时间:2019-02-12 10:20:31

标签: r

我正在使用:

# Sample input: the third entry is the literal string 'NULL', not R's NULL.
raw <- c('0', '13', 'NULL')

data <- data.frame(raw)
# Convert the raw strings to numbers; the 'NULL' row ends up as NA in `number`.
data$number <- as.numeric(as.character(data$raw))

data

# Bin `number` into labelled ranges with nested ifelse() calls.
# For the NA row the very first test (NA == 0) is itself NA, and ifelse()
# yields NA for an NA test without consulting the nested calls, so the
# is.na() check at the bottom is never reached for that row.
data$category <- ifelse(data$number == 0, "0",
ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
ifelse(data$number > 62, "63++",
ifelse(is.na(data$number) == TRUE, "unknown",
"unknown")))))))

data

我原本以为 raw 为 "NULL"(即 number 为 NA)的那一行会被映射为 "unknown",但实际得到的是:

   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA       <NA>

而不是我期望的:

   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA       unknown

有人可以告诉我为什么吗?当前的解决方法是运行:

# Overwrite the category of every row whose number is NA.
data$category[is.na(data$number)] <- "unknown"

在上面的代码块之后。

6 个答案:

答案 0 :(得分:4)

我不确定这是否正是您想要的方式,但使用 cut() 函数可以让代码简洁得多。

# Bin `number` with cut(): by default the intervals are closed on the right,
# i.e. (-Inf, 0], (0, 7], (7, 14], (14, 31], (31, 62], (62, Inf].
# One label per interval, in the same order as the breaks; adjust both to taste.
data[["category"]] <- cut(
  data[["number"]],
  breaks = c(-Inf, 0, 7, 14, 31, 62, Inf),
  labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++")
)

不幸的是,您需要这两行将NA转换为"Unknown"

# A factor only accepts values that are among its levels, so register
# "Unknown" as a level first and then assign it to the rows whose number is NA.
levels(data[["category"]]) <- union(levels(data[["category"]]), "Unknown")
data[["category"]][is.na(data[["number"]])] <- "Unknown"
data
#     raw number category
# 1    0      0        0
# 2   13     13  [8..14]
# 3 NULL     NA  Unknown

数据:

# Reproducible input: the third entry is the literal string "NULL", which
# cannot be parsed as a number and therefore becomes NA in `number`.
raw <- c("0", "13", "NULL")

data <- data.frame(raw = raw)
data[["number"]] <- as.numeric(as.character(data[["raw"]]))

基准测试:

# Time the four approaches from the answers against each other on `data`.
# Each named argument is one expression to be timed; the names become the
# `expr` column of the result table.
microbenchmark::microbenchmark(
  # cut(): bin with cut(), then add an "Unknown" level and assign it to the
  # rows whose number is NA.
  cut = {data$category <- cut(data$number, 
                              breaks=c(-Inf, 0, 7, 14, 31, 62, Inf), 
                              labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"))
  levels(data$category) <- c(levels(data$category), "Unknown")
  data$category[is.na(data$number)] <- "Unknown"},
  # findInterval(): build the labels from the bin bounds and look them up by
  # interval index. Note that this expression leaves its result in `res` and,
  # unlike the other three, does not write data$category.
  findInt = {vec<-c(0,7,14,31,62)
  levels<-c(vec[1],sprintf("[%d ... %d]",(vec+1)[-length(vec)],vec[-1]),
            paste0(vec[length(vec)]+1,"++"))
  res<-levels[findInterval(data$number,vec,left.open=TRUE)+1]
  res[is.na(res)]<-"unknown"},
  # lapply(): classify one value at a time with an if/else chain, testing for
  # NA first. lapply() returns a list, so data$category becomes a list column.
  lapply = {data$category <- lapply(data$number,function(x) {
    if(is.na(x) || is.null(x)) "unknown"
    else if(x == 0) "0"
    else if(x > 0 & x <= 7) "[1 ... 7]"
    else if(x > 7 & x <= 14) "[8 ... 14]"
    else if(x > 14 & x <= 31) "[15 ... 31]"
    else if(x > 31 & x <= 62) "[32 ... 62]"
    else if(x > 62) "63++"
    else "unknown"
  })},
  # ifelse(): nested vectorised ifelse() with the is.na() test outermost;
  # "???" is the fallback for values that match none of the ranges.
  ifelse = {data$category <- 
    ifelse(is.na(data$number), "unknown", 
           ifelse(data$number == 0, "0",
                  ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
                         ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
                                ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
                                       ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
                                              ifelse(data$number > 62, "63++", "???")))))))}
                               )

结果:

# Unit: microseconds
#    expr     min       lq       mean   median       uq        max neval
#     cut 132.207 139.4185  154.78149 144.9770 154.5925    283.043   100
# findInt  18.329  21.7850   26.58004  26.2915  28.8460     60.996   100
#  lapply  14.122  15.6250 4269.73574  17.2770  18.7800 425198.055   100
#  ifelse  81.728  84.8835   96.09675  88.9400  96.3010    193.503   100

答案 1 :(得分:1)

也许您更愿意在 within() 中使用条件赋值,这样写更加清晰。

# Label each row of `data` by the range its `number` falls into, using
# logical-index assignment inside within().
# Comparisons against an NA `number` yield NA, which a scalar assignment
# skips, so the NA rows are labelled explicitly in the last step.
data <- within(data, {
  # Start from an all-NA character vector with one entry per row, so the
  # column is character from the outset and always has the right length.
  category <- rep(NA_character_, length(number))
  category[number == 0] <- "0"
  category[number > 0 & number <= 7] <- "[1 ... 7]"
  category[number > 7 & number <= 14] <- "[8 ... 14]"
  category[number > 14 & number <= 31] <- "[15 ... 31]"
  category[number > 31 & number <= 62] <- "[32 ... 62]"
  # Everything above 62 belongs to the open-ended top bin.
  category[number > 62] <- "63++"
  category[is.na(number)] <- "unknown"
})

> data
   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA    unknown

答案 2 :(得分:1)

我不确定这对您是否有用,不过您可以继续沿用当前的做法。另外建议把 data 重命名为 df,因为 R 中已经有一个名为 data 的函数:

# Replace every category that is still NA with the fallback label, then
# print the resulting column.
df[["category"]][is.na(df[["category"]])] <- "Unknown"
df[["category"]]

答案 3 :(得分:1)

如果将is.na()移到开头,则当前代码将起作用:

# Nested ifelse() with the NA test outermost: rows with a missing number are
# labelled "unknown" before any comparison can produce NA for them.
# "???" marks values that match none of the ranges.
data$category <- with(
  data,
  ifelse(
    is.na(number), "unknown",
    ifelse(
      number == 0, "0",
      ifelse(
        number > 0 & number <= 7, "[1 ... 7]",
        ifelse(
          number > 7 & number <= 14, "[8 ... 14]",
          ifelse(
            number > 14 & number <= 31, "[15 ... 31]",
            ifelse(
              number > 31 & number <= 62, "[32 ... 62]",
              ifelse(number > 62, "63++", "???")
            )
          )
        )
      )
    )
  )
)

> data
   raw number   category
1    0      0          0
2   13     13 [8 ... 14]
3 NULL     NA    unknown

答案 4 :(得分:1)

我把嵌套的 ifelse 调用改写成了 lapply,可读性更好。最重要的改动是先用 is.na() 判断并返回 "unknown",而不是把它放在最后作为默认分支。原因是:当传入 NA 时,原代码的第一个测试是 NA == 0,其结果是 NA,既不是 TRUE 也不是 FALSE;ifelse 对这样的元素直接返回 NA,后面的检查根本不会执行,所以得不到 "unknown"。

# Classify each number with a scalar if/else chain, testing for NA first so
# that no comparison is ever evaluated on a missing value.
# vapply() with character(1) returns a plain character vector (lapply() would
# store a list column in the data frame) and checks that every branch yields
# exactly one string.
data$category <- vapply(data$number, function(x) {
  if (is.na(x)) {
    "unknown"
  } else if (x == 0) {
    "0"
  } else if (x > 0 && x <= 7) {
    "[1 ... 7]"
  } else if (x > 7 && x <= 14) {
    "[8 ... 14]"
  } else if (x > 14 && x <= 31) {
    "[15 ... 31]"
  } else if (x > 31 && x <= 62) {
    "[32 ... 62]"
  } else if (x > 62) {
    "63++"
  } else {
    # Reached only by values below 0.
    "unknown"
  }
}, FUN.VALUE = character(1))

答案 5 :(得分:1)

尝试一下:

#define a vector with the range values
vec<-c(0,7,14,31,62)
#create your labels
levels<-c(vec[1],sprintf("[%d ... %d]",(vec+1)[-length(vec)],vec[-1]),
                paste0(vec[length(vec)]+1,"++"))
#use findInterval to create your result
res<-levels[findInterval(data$number,vec,left.open=TRUE)+1]
#substitute the NA's
res[is.na(res)]<-"unknown"