我需要将虚拟变量转换为分类变量。对R陌生,我只是反过来知道如何去做。有人可以指出我正确的方向吗?
数据框为:
undefined reference to 'SDL_RenderPresent'
,预期输出为:
答案 0 :(得分:4)
通过忽略第一列(lapply
),使用id
的一个选项,我们检查其中哪些列的值为1,并将其替换为对应的列名,其他可以更改为{{1 }}。
NA
不使用data[-1] <- lapply(names(data[-1]), function(x) ifelse(data[x] == 1, x, NA))
data
# id red blue yellow
#1 1 red blue <NA>
#2 2 <NA> blue <NA>
#3 3 red blue <NA>
#4 4 <NA> blue <NA>
#5 5 red <NA> <NA>
#6 6 <NA> blue <NA>
#7 7 <NA> blue <NA>
#8 8 <NA> blue yellow
#9 9 <NA> <NA> yellow
循环的另一种方法
lapply
答案 1 :(得分:1)
由于“红色”,“蓝色”,“黄色”列为factor
,因此我们将其强制为numeric
,并使用索引将Map
中的相应列名替换为
data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)],
data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
# id c1 c2 c3
#1 1 red blue None
#2 2 None blue None
#3 3 red blue None
#4 4 None blue None
#5 5 red None None
#6 6 None blue None
#7 7 None blue None
#8 8 None blue yellow
#9 9 None None yellow
或通过更改levels
data[-1] <- Map(function(x, y) {levels(x) <- c('None', y)
x},data[-1], names(data)[-1])
或者使用lapply
,我们遍历列的顺序,提取列,将其更改为numeric
,然后使用索引将值更改为列名和“无”
data[-1] <- lapply(seq_along(data[-1]), function(i)
c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )
注意:提供预期的输出。
或者使用矢量化方法,我们创建一个逻辑矩阵,乘以列索引,然后将索引更改为列名
data[-1] <- `dim<-`(names(data)[-1][col(data[-1]) *
(NA ^(data[-1] == 0))], dim(data[-1]))
或带有replace
data[-1] <- replace(as.matrix(data[-1]), data[-1]==1,
rep(names(data)[-1], colSums(data[-1] == 1)))
或使用tidyverse
library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
bind_cols(data[1], .) %>%
rename_at(2:4, ~ paste0("c", 1:3))
# id c1 c2 c3
#1 1 red blue none
#2 2 none blue none
#3 3 red blue none
#4 4 none blue none
#5 5 red none none
#6 6 none blue none
#7 7 none blue none
#8 8 none blue yellow
#9 9 none none yellow
或与gather/spread
data %>%
gather(key, val, -id) %>%
mutate(val = case_when(val == 1 ~ key),
key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>%
spread(key, val)
这是一些基准测试
data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.065 0.014 0.078
system.time({
`dim<-`(names(data1)[-1][col(data1[-1]) *
(NA ^(data1[-1] == 0))], dim(data1[-1]))
})
# user system elapsed
# 0.387 0.036 0.422
system.time({
imap(data1[-1], ~ c('none', .y)[as.numeric(.x)])
})
# user system elapsed
# 0.047 0.006 0.054
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 0.555 0.067 0.621
system.time({
ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)
})
# user system elapsed
# 0.711 0.060 0.770
在1e6数据集上
data1 <- data[rep(seq_len(nrow(data)), 1e6),]
system.time({Map( function(x, y) {levels(x) <- c('None', y)
x},data1[-1], names(data1)[-1])})
# user system elapsed
# 0.123 0.016 0.139
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.328 0.074 0.402
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 7.125 0.463 7.561
具有微基准测试
library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1]),
ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr min lq mean median uq max nev
#ak 6.14964 4.048205 2.401768 1.741373 2.47268 2.43698 10
#ak2 1.00000 1.000000 1.000000 1.000000 1.00000 1.00000 10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278 10
data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")),
class = "data.frame", row.names = c(NA,
-9L))
答案 2 :(得分:0)
这也许吗?
library(tidyverse)
data <- data %>%
mutate(red = case_when(
red == 1 ~ "Red", red == 0 ~ "None")) %>%
mutate(blue = case_when(
blue == 1 ~ "Blue", blue == 0 ~ "None")) %>%
mutate(yellow = case_when(
yellow == 1 ~ "Yellow", yellow == 0 ~ "None"
))
data