在dplyr中使用mutate创建虚拟变量

时间:2018-03-14 11:42:25

标签: r dplyr dummy-variable

我想使用dplyr创建7个虚拟变量 - 一周中的每一天各一个

到目前为止,我已经使用sjmisc包的to_dummy函数实现了这一点,但我是分两步执行的:1)创建一个只包含虚拟变量的数据框;2)把它追加到原来的数据框

# Sample data frame: nine rows with an id column `x` (letters a-i) and a
# weekday label column `day`.
mydf <- data.frame(
  x = letters[1:9],
  day = c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun", "Fri", "Mon")
)

# 1. Create the 7 dummy variables separately: one 0/1 column per weekday.
#    With suffix = "label" the columns are named after the values
#    (day_Fri, day_Mon, ... in the printed result).
daysdummy <- sjmisc::to_dummy(mydf$day, suffix = "label")

# 2. Append the dummy columns to the original data frame.
#    bind_cols() is namespaced so this step does not depend on dplyr having
#    been attached beforehand.
mydf <- dplyr::bind_cols(mydf, daysdummy)


> mydf
  x   day day_Fri day_Mon day_Sat day_Sun day_Thurs day_Tues day_Wed
1 a   Mon       0       1       0       0         0        0       0
2 b  Tues       0       0       0       0         0        1       0
3 c   Wed       0       0       0       0         0        0       1
4 d Thurs       0       0       0       0         1        0       0
5 e   Fri       1       0       0       0         0        0       0
6 f   Sat       0       0       1       0         0        0       0
7 g   Sun       0       0       0       1         0        0       0
8 h   Fri       1       0       0       0         0        0       0
9 i   Mon       0       1       0       0         0        0       0

我的问题是,我是否可以使用dplyr在单个工作流程中执行此操作,并将to_dummy添加到管道工作流程中 - 可能使用mutate

* to_dummy documentation

3 个答案:

答案 0 :(得分:5)

如果您想对管道执行此操作,您可以执行以下操作:

library(dplyr)
library(sjmisc)

# Build the dummy columns from `day`, re-attach the original columns,
# then move x and day to the front.
mydf %>% 
  # suffix = "label" names each dummy after its value (day_Fri, day_Mon, ...)
  to_dummy(day, suffix = "label") %>% 
  # the printed result has x and day only once, so to_dummy() evidently
  # yields just the dummy columns; bind the original data back on
  bind_cols(mydf) %>% 
  select(x, day, everything())

返回:

# A tibble: 9 x 9
  x     day   day_Fri day_Mon day_Sat day_Sun day_Thurs day_Tues day_Wed
  <fct> <fct>   <dbl>   <dbl>   <dbl>   <dbl>     <dbl>    <dbl>   <dbl>
1 a     Mon        0.      1.      0.      0.        0.       0.      0.
2 b     Tues       0.      0.      0.      0.        0.       1.      0.
3 c     Wed        0.      0.      0.      0.        0.       0.      1.
4 d     Thurs      0.      0.      0.      0.        1.       0.      0.
5 e     Fri        1.      0.      0.      0.        0.       0.      0.
6 f     Sat        0.      0.      1.      0.        0.       0.      0.
7 g     Sun        0.      0.      0.      1.        0.       0.      0.
8 h     Fri        1.      0.      0.      0.        0.       0.      0.
9 i     Mon        0.      1.      0.      0.        0.       0.      0.

我们也可以使用dplyr和tidyr:

library(dplyr)
library(tidyr)

mydf %>%
  # Helper column of 1s; it becomes the cell value of each dummy column.
  mutate(var = 1) %>%
  # One column per weekday, named <key><sep><value> (day_Fri, day_Mon, ...);
  # combinations that do not occur are filled with 0.
  # NOTE(review): spread() is superseded by pivot_wider() in newer tidyr.
  spread(day, var, fill = 0, sep = "_") %>%
  # spread() consumed the `day` column, so join it back from the original
  # data. The key is spelled out instead of relying on the implicit
  # natural join over all shared column names.
  left_join(mydf, by = "x") %>%
  select(x, day, everything())

使用基础R(base R)我们可以做类似的事情:

# Cross-tabulate x against day and convert the contingency table into a data
# frame: one row per x, one count column per day.
# NOTE(review): for a plain vector `day`, lengths() is all 1s and unlist() is
# a no-op; presumably kept so the call also handles a list column - confirm.
with(
  mydf,
  as.data.frame.matrix(table(rep(x, lengths(day)), unlist(day)))
)

返回:

  Fri Mon Sat Sun Thurs Tues Wed
a   0   1   0   0     0    0   0
b   0   0   0   0     0    1   0
c   0   0   0   0     0    0   1
d   0   0   0   0     1    0   0
e   1   0   0   0     0    0   0
f   0   0   1   0     0    0   0
g   0   0   0   1     0    0   0
h   1   0   0   0     0    0   0
i   0   1   0   0     0    0   0

答案 1 :(得分:0)

使用dummies包(其中的dummy()函数)的替代解决方案,我认为会更快

# Rebuild the sample data so this answer can be run on its own.
mydf <- data.frame(
  x = letters[1:9],
  day = c("Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun", "Fri", "Mon")
)

library(dummies)

# dummy() yields one 0/1 column per distinct day; cbind() appends those
# columns to the right of the original ones.
mydf <- cbind(mydf, dummy(mydf$day, sep = "_"))

产生

x   day mydf_Fri mydf_Mon mydf_Sat mydf_Sun mydf_Thurs mydf_Tues mydf_Wed
1 a   Mon        0        1        0        0          0         0        0
2 b  Tues        0        0        0        0          0         1        0
3 c   Wed        0        0        0        0          0         0        1
4 d Thurs        0        0        0        0          1         0        0
5 e   Fri        1        0        0        0          0         0        0
6 f   Sat        0        0        1        0          0         0        0
7 g   Sun        0        0        0        1          0         0        0
8 h   Fri        1        0        0        0          0         0        0
9 i   Mon        0        1        0        0          0         0        0

然后您可以使用gsub()来获得更清晰的名称

# Strip the "mydf_" prefix that the dummy() step left on the new columns.
# The pattern is anchored with ^ so only a leading prefix is removed and a
# column that merely contains "mydf_" elsewhere in its name is left alone.
names(mydf) <- gsub("^mydf_", "", names(mydf))
head(mydf)
  x   day Fri Mon Sat Sun Thurs Tues Wed
1 a   Mon   0   1   0   0     0    0   0
2 b  Tues   0   0   0   0     0    1   0
3 c   Wed   0   0   0   0     0    0   1
4 d Thurs   0   0   0   0     1    0   0
5 e   Fri   1   0   0   0     0    0   0
6 f   Sat   0   0   1   0     0    0   0

答案 2 :(得分:0)

您也可以使用基础R的model.matrix代替sjmisc::to_dummy;一个dplyr解决方案是:

library(dplyr)

# "~ 0 + day" removes the intercept, so every value of `day` gets its own
# 0/1 column (dayFri, dayMon, ... in the result below).
model.matrix(~ 0 + day, data = mydf) %>%
  as.data.frame() %>%
  # Re-attach the original columns, then move x and day to the front.
  bind_cols(mydf) %>%
  select(x, day, everything())
#  x   day dayFri dayMon daySat daySun dayThurs dayTues dayWed
#1 a   Mon      0      1      0      0        0       0      0
#2 b  Tues      0      0      0      0        0       1      0
#3 c   Wed      0      0      0      0        0       0      1
#4 d Thurs      0      0      0      0        1       0      0
#5 e   Fri      1      0      0      0        0       0      0
#6 f   Sat      0      0      1      0        0       0      0
#7 g   Sun      0      0      0      1        0       0      0
#8 h   Fri      1      0      0      0        0       0      0
#9 i   Mon      0      1      0      0        0       0      0