将具有伪变量的数据框转换为分类变量

时间:2018-09-03 03:26:21

标签: r categorical-data dummy-variable

我需要将虚拟变量转换为分类变量。对R陌生,我只是反过来知道如何去做。有人可以指出我正确的方向吗?

数据框为:

undefined reference to 'SDL_RenderPresent'

Input Dataframe

,预期输出为:

output dataframe

3 个答案:

答案 0 :(得分:4)

通过忽略第一列(lapply),使用id的一个选项,我们检查其中哪些列的值为1,并将其替换为对应的列名,其他可以更改为{{1 }}。

NA

不使用data[-1] <- lapply(names(data[-1]), function(x) ifelse(data[x] == 1, x, NA)) data # id red blue yellow #1 1 red blue <NA> #2 2 <NA> blue <NA> #3 3 red blue <NA> #4 4 <NA> blue <NA> #5 5 red <NA> <NA> #6 6 <NA> blue <NA> #7 7 <NA> blue <NA> #8 8 <NA> blue yellow #9 9 <NA> <NA> yellow 循环的另一种方法

lapply

答案 1 :(得分:1)

由于“红色”,“蓝色”,“黄色”列为factor,因此我们将其强制为numeric,并使用索引将Map中的相应列名替换为

data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)], 
                  data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
#  id  c1   c2     c3
#1  1  red blue   None
#2  2 None blue   None
#3  3  red blue   None
#4  4 None blue   None
#5  5  red None   None
#6  6 None blue   None
#7  7 None blue   None
#8  8 None blue yellow
#9  9 None None yellow

或通过更改levels

的另一种选择
data[-1] <-  Map(function(x, y) {levels(x) <- c('None', y)
                    x},data[-1], names(data)[-1])

或者使用lapply,我们遍历列的顺序,提取列,将其更改为numeric,然后使用索引将值更改为列名和“无”

data[-1] <- lapply(seq_along(data[-1]), function(i) 
         c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )

注意:提供预期的输出。


或者使用矢量化方法,我们创建一个逻辑矩阵,乘以列索引,然后将索引更改为列名

data[-1] <-  `dim<-`(names(data)[-1][col(data[-1]) * 
                       (NA ^(data[-1] == 0))], dim(data[-1]))

或带有replace

的另一个选项
data[-1] <- replace(as.matrix(data[-1]), data[-1]==1, 
               rep(names(data)[-1], colSums(data[-1] == 1)))

或使用tidyverse

library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
         bind_cols(data[1], .) %>% 
         rename_at(2:4, ~ paste0("c", 1:3))
#  id   c1   c2     c3
#1  1  red blue   none
#2  2 none blue   none
#3  3  red blue   none
#4  4 none blue   none
#5  5  red none   none
#6  6 none blue   none
#7  7 none blue   none
#8  8 none blue yellow
#9  9 none none yellow

或与gather/spread

data %>% 
    gather(key, val, -id) %>% 
    mutate(val = case_when(val == 1 ~ key), 
    key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>% 
    spread(key, val)

基准

这是一些基准测试

data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
  Map(function(x, y) c('None', y)[as.numeric(x)], 
                   data1[-1], names(data1)[-1])

 })
#   user  system elapsed 
#  0.065   0.014   0.078 


system.time({
 `dim<-`(names(data1)[-1][col(data1[-1]) * 
                       (NA ^(data1[-1] == 0))], dim(data1[-1]))

})
# user  system elapsed 
#  0.387   0.036   0.422 

system.time({
   imap(data1[-1], ~ c('none', .y)[as.numeric(.x)]) 
})
# user  system elapsed 
# 0.047   0.006   0.054 


 system.time({
   lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
  }
  )
 # user  system elapsed 
 #  0.555   0.067   0.621 


system.time({
    ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)

  })
# user  system elapsed 
#  0.711   0.060   0.770 

在1e6数据集上

data1 <- data[rep(seq_len(nrow(data)), 1e6),] 
system.time({Map( function(x, y) {levels(x) <- c('None', y)
              x},data1[-1], names(data1)[-1])})
#   user  system elapsed 
#    0.123   0.016   0.139 

system.time({
   Map(function(x, y) c('None', y)[as.numeric(x)], 
                   data1[-1], names(data1)[-1])

  })
#   user  system elapsed 
#  0.328   0.074   0.402 
 system.time({
    lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
   }
   )
#   user  system elapsed 
#  7.125   0.463   7.561 

具有微基准测试

library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)], 
                 data1[-1], names(data1)[-1]), 
         ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
          rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr      min        lq      mean    median       uq      max nev 
#ak  6.14964  4.048205  2.401768  1.741373  2.47268  2.43698    10
#ak2  1.00000  1.000000  1.000000  1.000000  1.00000  1.00000    10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278    10

数据

data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L, 
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0", 
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")), 
 class = "data.frame", row.names = c(NA, 
-9L))

答案 2 :(得分:0)

这也许吗?

library(tidyverse)

data <- data %>%
  mutate(red = case_when(
    red == 1 ~ "Red", red == 0 ~ "None")) %>% 
  mutate(blue = case_when(
    blue == 1 ~ "Blue", blue == 0 ~ "None")) %>% 
  mutate(yellow = case_when(
    yellow == 1 ~ "Yellow", yellow == 0 ~ "None"
))

data