我正在尝试创建一个如下所示的函数:
# Intended to map a vector of state codes to region names through a chain of
# nested ifelse() calls, falling back to the string "NA" for unlisted codes.
# NOTE: as written this does not parse. Every ifelse() after the first one is
# missing the comma between its test (`State %in% c(...)`) and the label
# string that follows, so the parser hits a string literal where it expects
# `,` or `)` -- hence "unexpected string constant".
region <- function(State){
region_vector <-
# "MD" is listed twice here, and "PA" is also listed in the East Central set
# below; once the commas are fixed the first matching branch would win.
ifelse(State %in% c("CT", "DE", "DC", "MD", "ME", "MD", "MA", "NH","NJ", "NY", "PA", "RI", "VT"), "Northeast",
# Missing `,` before "East Central".
ifelse(State %in% c("IN", "KY", "MI", "OH", "PA", "WV") "East Central",
# Exact duplicate of the previous branch (and also missing its `,`).
ifelse(State %in% c("IN", "KY", "MI", "OH", "PA", "WV") "East Central",
# Missing `,` before "West Central".
ifelse(State %in% c("CO", "IL", "IA", "KS", "MN", "MO", "MT", "NE", "ND", "SD", "WI", "WY") "West Central",
# Missing `,` before "Southeast".
ifelse(State %in% c("AL", "FL", "GA", "MS", "NC", "SC", "TN", "VA") "Southeast",
# Missing `,` before "Southwest".
ifelse(State %in% c("AR", "LA", "NM", "OK", "TC") "Southwest",
# Missing `,` before "Pacific".
ifelse(State %in% c("AK", "AZ", "CA", "HI", "ID", "NV", "OR", "UT", "WA") "Pacific",
# Fallback is the character string "NA", not a missing value.
"NA" )))))))
return(region_vector) }
我想根据数据中某一列的州代码为每条记录分配区域,该列形如:
c ("CA", "NY", "CO"...)
我上面编写的代码有什么问题?错误消息为:`unexpected string constant in c(...)`
答案 0 :(得分:1)
只是为了向你展示其他方法,以及 `ifelse` 有多慢(slooow),例如:
# Lookup list: region label -> state codes that belong to that region
# NOTE(review): "TC" is not a USPS state code; presumably "TX" was meant. It
# is left unchanged because region() further down hard-codes the same value.
l <- list(
  "Northeast" = c("CT", "DE", "DC", "MD", "ME", "MA", "NH", "NJ", "NY", "RI", "VT"),
  "East central" = c("IN", "KY", "MI", "OH", "PA", "WV"),
  "West central" = c("CO", "IL", "IA", "KS", "MN", "MO", "MT", "NE", "ND", "SD", "WI", "WY"),
  "Southeast" = c("AL", "FL", "GA", "MS", "NC", "SC", "TN", "VA"),
  "Southwest" = c("AR", "LA", "NM", "OK", "TC"),
  "Pacific" = c("AK", "AZ", "CA", "HI", "ID", "NV", "OR", "UT", "WA")
)
# long list: flat character vector of codes, each element named with its
# region label. lengths() gives the per-region counts as an integer vector,
# instead of handing rep() a list and relying on implicit coercion.
L <- setNames(
  unlist(l, use.names = FALSE),
  rep(names(l), times = lengths(l))
)
# data.frame: the same lookup as a two-column table, derived from L so the
# codes and labels are only flattened once
df <- data.frame(
  CODE = unname(L),
  LABEL = names(L),
  stringsAsFactors = FALSE, row.names = NULL
)
# Test data: 1e4 codes drawn with replacement from the lookup; the seed is
# fixed so the sample is reproducible
set.seed(123)
test <- data.frame(
  CODE = sample(x = unlist(l), size = 1e4, replace = TRUE),
  stringsAsFactors = FALSE
)
# Fun to recode with match
#' Recode values through a named lookup vector.
#'
#' @param var Vector of values to recode.
#' @param dico Named vector whose values are the codes and whose names are
#'   the labels to return.
#' @return For each element of `var`, the name of its first match in `dico`;
#'   `NA` where there is no match.
match_recode <- function(var, dico) {
  positions <- match(x = var, table = dico)
  labels <- names(dico)
  labels[positions]
}
# With ifelse
#' Map state codes to region labels with a chain of nested ifelse() calls.
#'
#' @param State Vector of two-letter state codes.
#' @return One label per element of `State`. Codes that appear in none of the
#'   sets below yield the literal string "NA" (a character value, not a
#'   missing value).
region <- function(State) {
  # The first matching branch wins. The original chain tested the East
  # Central set twice in a row; the second test could never be reached with a
  # TRUE result, so it has been dropped.
  # NOTE(review): labels here are "East Central"/"West Central", whereas the
  # lookup list `l` spells them "East central"/"West central".
  # NOTE(review): "TC" is not a USPS state code; presumably "TX" was meant.
  ifelse(State %in% c("CT", "DE", "DC", "MD", "ME", "MA", "NH", "NJ", "NY", "RI", "VT"), "Northeast",
  ifelse(State %in% c("IN", "KY", "MI", "OH", "PA", "WV"), "East Central",
  ifelse(State %in% c("CO", "IL", "IA", "KS", "MN", "MO", "MT", "NE", "ND", "SD", "WI", "WY"), "West Central",
  ifelse(State %in% c("AL", "FL", "GA", "MS", "NC", "SC", "TN", "VA"), "Southeast",
  ifelse(State %in% c("AR", "LA", "NM", "OK", "TC"), "Southwest",
  ifelse(State %in% c("AK", "AZ", "CA", "HI", "ID", "NV", "OR", "UT", "WA"), "Pacific",
  "NA"))))))
}
# With data.table
#' Recode values by joining them against a code/label lookup table.
#'
#' The original keyed the input and ran `dt[dico]`, which returns rows in
#' dictionary order: the labels were not aligned with `var`, inputs without a
#' dictionary match were dropped, and dictionary codes absent from `var`
#' still contributed a label. Joining with the input as `i` keeps one result
#' row per input element, in input order.
#'
#' @param var Vector of codes to recode.
#' @param dico data.frame (or data.table) whose first column holds the codes
#'   and which has a `LABEL` column.
#' @return The `LABEL` values aligned with `var`, `NA` where a code has no
#'   match. Assumes codes are unique in `dico`; duplicated codes would yield
#'   more than one row per input element.
dt_recode <- function(var, dico) {
  lookup <- data.table::as.data.table(dico)
  # As before, the join uses the first column of `dico`, whatever its name.
  key_col <- names(lookup)[1L]
  hits <- lookup[
    data.table::data.table(CODE = var),
    on = stats::setNames("CODE", key_col)
  ]
  hits$LABEL
}
测试结果
# Tabulate the labels produced by each recoder on the same test data.
table(match_recode(test$CODE, dico = L))
# East central Northeast Pacific Southeast Southwest West central
# 1211 2132 1711 1554 998 2394
table(region(test$CODE))
# region() hard-codes its labels with a capital "C":
# East Central Northeast Pacific Southeast Southwest West Central
# 1211 2132 1711 1554 998 2394
# data.table must be attached before dt_recode() is called.
library("data.table")
table(dt_recode(test$CODE, dico = df))
# East central Northeast Pacific Southeast Southwest West central
# 1211 2132 1711 1554 998 2394
# All the same counts
基准:
# Time the three recoders on the same test data, 100 evaluations each.
library("microbenchmark")
microbenchmark(match_recode(test$CODE, dico = L),
region(test$CODE),
dt_recode(test$CODE, dico = df),
times = 100L)
# Recorded output (timings depend on the machine):
# Unit: microseconds
# expr min lq mean median uq max neval
# match_recode(test$CODE, dico = L) 266.845 271.549 344.7044 288.2265 298.7035 1138.792 100
# region(test$CODE) 23454.496 24250.325 26391.6468 24637.9750 25257.4050 49958.884 100
# dt_recode(test$CODE, dico = df) 1133.233 1184.977 1355.1031 1364.3705 1445.8345 2116.794 100
`match` 比 `ifelse` 快得多!
使用data.table
PS:在您的 `ifelse` 代码中,`MD` 在 Northeast(东北部)里出现了两次,而 `PA` 同时出现在 Northeast(东北部)和 East Central(中东部)里。