我正在使用:
# Example input: three raw strings, one of which ("NULL") is not a number.
raw <- c("0", "13", "NULL")
data <- data.frame(raw = raw)
# Go through character first so the conversion also works when `raw` is
# stored as a factor; the non-numeric entry becomes NA (with a warning).
data[["number"]] <- as.numeric(as.character(data[["raw"]]))
data
# Bucket `number` with a chain of nested ifelse() calls. The is.na() test is
# the innermost (last) one, exactly as in the question being asked.
data[["category"]] <- with(data,
  ifelse(number == 0, "0",
  ifelse(number > 0 & number <= 7, "[1 ... 7]",
  ifelse(number > 7 & number <= 14, "[8 ... 14]",
  ifelse(number > 14 & number <= 31, "[15 ... 31]",
  ifelse(number > 31 & number <= 62, "[32 ... 62]",
  ifelse(number > 62, "63++",
  ifelse(is.na(number), "unknown",
         "unknown"))))))))
data
我原以为 raw 为 "NULL"(转换后 number 为 NA)的那一行会被归类为 "unknown",但实际得到的是:
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA <NA>
而我期望的是:
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA unknown
有人可以告诉我为什么吗?当前的解决方法是运行:
# Workaround: overwrite the category wherever the parsed number is missing.
data$category[is.na(data$number)] <- "unknown"
在上面的代码块之后。
答案 0 :(得分:4)
我不确定这是否是您想要的方式,但使用 cut()
函数肯定可以让代码不那么冗长。
# Bin `number` with cut(). Intervals are right-closed by default, so the
# breaks below give (-Inf, 0], (0, 7], (7, 14], (14, 31], (31, 62], (62, Inf];
# note that negative values therefore fall into the "0" bin. NA stays NA.
data[["category"]] <- cut(
  x = data[["number"]],
  # cut points -- adjust as needed
  breaks = c(-Inf, 0, 7, 14, 31, 62, Inf),
  # one label per interval, in the same order as the intervals
  labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++")
)
不幸的是,您需要这两行将NA
转换为"Unknown"
:
# A factor only accepts values that are among its levels, so register
# "Unknown" as an extra level before assigning it.
levels(data$category) <- append(levels(data$category), "Unknown")
# Rows whose number could not be parsed get the new level.
data$category <- replace(data$category, is.na(data$number), "Unknown")
data
# raw number category
# 1 0 0 0
# 2 13 13 [8..14]
# 3 NULL NA Unknown
数据:
# Reproducible input: "NULL" cannot be parsed as a number and becomes NA.
raw <- c("0", "13", "NULL")
data <- data.frame(raw = raw)
data[["number"]] <- as.numeric(as.character(data[["raw"]]))
基准测试:
# Time the four approaches from the answers against each other on `data`.
# Each named argument is one expression to be timed; the names become the
# `expr` labels in the printed result.
microbenchmark::microbenchmark(
# cut(): bin with right-closed intervals, then add an "Unknown" level and
# assign it to the rows whose number is NA.
cut = {data$category <- cut(data$number,
breaks=c(-Inf, 0, 7, 14, 31, 62, Inf),
labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"))
levels(data$category) <- c(levels(data$category), "Unknown")
data$category[is.na(data$number)] <- "Unknown"},
# findInterval(): build the labels from the bounds and index into them.
# Note that this variant leaves its result in `res`; it does not assign
# data$category like the other three.
findInt = {vec<-c(0,7,14,31,62)
levels<-c(vec[1],sprintf("[%d ... %d]",(vec+1)[-length(vec)],vec[-1]),
paste0(vec[length(vec)]+1,"++"))
res<-levels[findInterval(data$number,vec,left.open=TRUE)+1]
res[is.na(res)]<-"unknown"},
# lapply(): classify one element at a time with an if/else chain, testing
# for NA first. lapply() returns a list, so data$category is assigned a list
# here rather than a character vector.
lapply = {data$category <- lapply(data$number,function(x) {
if(is.na(x) || is.null(x)) "unknown"
else if(x == 0) "0"
else if(x > 0 & x <= 7) "[1 ... 7]"
else if(x > 7 & x <= 14) "[8 ... 14]"
else if(x > 14 & x <= 31) "[15 ... 31]"
else if(x > 31 & x <= 62) "[32 ... 62]"
else if(x > 62) "63++"
else "unknown"
})},
# ifelse(): the nested chain from the question with the is.na() test moved
# to the front; values matching no condition get "???".
ifelse = {data$category <-
ifelse(is.na(data$number), "unknown",
ifelse(data$number == 0, "0",
ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
ifelse(data$number > 62, "63++", "???")))))))}
)
结果:
# Unit: microseconds
# expr min lq mean median uq max neval
# cut 132.207 139.4185 154.78149 144.9770 154.5925 283.043 100
# findInt 18.329 21.7850 26.58004 26.2915 28.8460 60.996 100
# lapply 14.122 15.6250 4269.73574 17.2770 18.7800 425198.055 100
# ifelse 81.728 84.8835 96.09675 88.9400 96.3010 193.503 100
答案 1 :(得分:1)
也许您更愿意将条件赋值与 within()
一起使用,这样更清晰。
# Build `category` by conditional assignment inside within(), where the
# columns of `data` can be referenced by bare name.
data <- within(data, {
  # Start with one character NA per row so the column has a single type
  # and the right length from the beginning (also for a zero-row `data`).
  category <- rep(NA_character_, length(number))
  # An NA in `number` makes each comparison NA, which selects nothing in a
  # subscripted assignment with a length-one value.
  category[number == 0] <- "0"
  category[number > 0 & number <= 7] <- "[1 ... 7]"
  category[number > 7 & number <= 14] <- "[8 ... 14]"
  category[number > 14 & number <= 31] <- "[15 ... 31]"
  category[number > 31 & number <= 62] <- "[32 ... 62]"
  # Everything above the last bound gets its own open-ended label.
  category[number > 62] <- "63++"
  # Missing numbers are labelled explicitly; values matching none of the
  # conditions (negative numbers) remain NA.
  category[is.na(number)] <- "unknown"
})
> data
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA unknown
答案 2 :(得分:1)
我不知道这对您是否有用,可以继续使用当前的方法:将data
重命名为df
,因为有一个名为data
的函数
# Fill the categories that are still missing, then show the column.
df$category <- replace(df$category, is.na(df$category), "Unknown")
df$category
答案 3 :(得分:1)
如果将is.na()
移到开头,则当前代码将起作用:
# Same bucketing as the question, but the missing-value test comes first, so
# an NA is labelled before any numeric comparison can turn it into NA.
# Values that match no condition (negative numbers) get "???".
data$category <- local({
  n <- data$number
  ifelse(is.na(n), "unknown",
  ifelse(n == 0, "0",
  ifelse(n > 0 & n <= 7, "[1 ... 7]",
  ifelse(n > 7 & n <= 14, "[8 ... 14]",
  ifelse(n > 14 & n <= 31, "[15 ... 31]",
  ifelse(n > 31 & n <= 62, "[32 ... 62]",
  ifelse(n > 62, "63++", "???")))))))
})
> data
raw number category
1 0 0 0
2 13 13 [8 ... 14]
3 NULL NA unknown
答案 4 :(得分:1)
我改用 lapply 重写,而不是嵌套的 ifelse 函数调用,使其更易读。值得注意的变化是首先测试 is.na() 并返回 "unknown",而不是把它放在最后作为默认检查。原因是:当传入 NA 时,您的第一个测试是 NA == 0,其结果是 NA,既不是 TRUE 也不是 FALSE;ifelse 对测试结果为 NA 的元素直接返回 NA,因此永远不会走到最后那个返回 "unknown" 的检查。
# Classify each number individually. vapply() is the type-checked sibling of
# lapply(): it returns a plain character vector (one string per row) instead
# of a list, so `category` becomes an ordinary character column.
data$category <- vapply(data$number, function(x) {
  # Test for NA first: every comparison below yields NA for a missing value,
  # and `if` cannot branch on NA.
  if (is.na(x)) {
    "unknown"
  } else if (x == 0) {
    "0"
  } else if (x > 0 && x <= 7) {
    "[1 ... 7]"
  } else if (x > 7 && x <= 14) {
    "[8 ... 14]"
  } else if (x > 14 && x <= 31) {
    "[15 ... 31]"
  } else if (x > 31 && x <= 62) {
    "[32 ... 62]"
  } else if (x > 62) {
    "63++"
  } else {
    # Only negative numbers reach this branch.
    "unknown"
  }
}, FUN.VALUE = character(1))
答案 5 :(得分:1)
尝试一下:
# Upper bounds of the bins, in increasing order as findInterval() expects.
vec <- c(0, 7, 14, 31, 62)
# Labels: the first bound itself, then "[lo ... hi]" for each pair of
# neighbouring bounds, then an open-ended label above the last bound.
levels <- c(
  vec[1],
  sprintf("[%d ... %d]", head(vec + 1, -1), tail(vec, -1)),
  paste0(tail(vec, 1) + 1, "++")
)
# With left.open = TRUE the intervals are (lower, upper], so a value equal to
# a bound stays in the lower bin; + 1 turns the 0-based interval number into
# an index into `levels`.
res <- levels[findInterval(data$number, vec, left.open = TRUE) + 1]
# An NA number gives an NA index and therefore an NA label; replace those.
res <- replace(res, is.na(res), "unknown")