Each opportunity ID has several products. I would like a binary column that says whether the opportunity has a given product. How can I do that?
Input
+---+---------------+--------+----------+----------+
|   | Opportunityid | Level  | Product1 | Product2 |
+---+---------------+--------+----------+----------+
| 1 | 10            | Low    | SS       | ISP      |
| 2 | 20            | High   | ISP      | Azure    |
| 3 | 30            | Normal | Azure    | ISP      |
| 4 | 40            |        | SS       |          |
| 5 | 50            |        | ISP      |          |
+---+---------------+--------+----------+----------+
Expected output (checking Product1 and Product2)
+---+---------------+--------+----------+----------+--------+---------+-----------+
|   | Opportunityid | Level  | Product1 | Product2 | HasSS? | HasISP? | HasAzure? |
+---+---------------+--------+----------+----------+--------+---------+-----------+
| 1 | 10            | Low    | SS       | ISP      | 1      | 1       | 0         |
| 2 | 20            | High   | ISP      | Azure    | 0      | 1       | 1         |
| 3 | 30            | Normal | Azure    | ISP      | 0      | 1       | 1         |
| 4 | 40            |        | SS       |          | 1      | 0       | 0         |
| 5 | 50            |        | ISP      |          | 0      | 1       | 0         |
+---+---------------+--------+----------+----------+--------+---------+-----------+
Code
library(caret)

Products <- data.frame(
  Opportunityid = c(10, 20, 30, 40, 50),
  Level         = c('Low', 'High', 'Normal', '', ''),
  Product1      = c('SS', 'ISP', 'Azure', 'SS', 'ISP'),
  Product2      = c('ISP', 'Azure', 'ISP', '', ''))

# dummify the data (creates one set of dummy columns per original column)
dmy <- dummyVars(" ~ .", data = Products)
trsf <- data.frame(predict(dmy, newdata = Products))
trsf
PS: I have more than 100 products, so I would like the process to be automated.
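For reference, a minimal base-R sketch that builds one Has column per distinct product from the Products data frame above (assuming an empty string marks a missing product; prod_cols and all_products are illustrative helper names, not part of the original attempt):

# all Product* columns and every distinct, non-empty product name
prod_cols    <- grep("^Product", names(Products), value = TRUE)
all_products <- setdiff(unique(unlist(Products[prod_cols])), "")

# one 0/1 column per product: does any Product column of the row contain it?
has <- sapply(all_products,
              function(p) as.integer(rowSums(Products[prod_cols] == p) > 0))
colnames(has) <- paste0("Has", all_products)
cbind(Products, has)

Because the product names are discovered from the data, this scales to 100+ products without listing them by hand.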
Answer 0 (score: 1)
You can use the tidyverse.
Clean the data:
library(tidyverse)

Products <- data.frame(
  Opportunityid = c(10, 20, 30, 40, 50),
  Level         = c('Low', 'High', 'Normal', '', ''),
  Product1      = c('SS', 'ISP', 'Azure', 'SS', 'ISP'),
  Product2      = c('ISP', 'Azure', 'ISP', '', ''),
  stringsAsFactors = FALSE)
Products %>%
  gather(key, value, Product1:Product2) %>%              ## collect all Product columns
  mutate(has = ifelse(value == '', '', 1)) %>%           ## add a dummy variable
  spread(value, has, fill = 0) %>%                       ## spread the values back in wider format
  select(-key, -V1) %>%                                  ## remove empty columns and former product column
  group_by(Opportunityid, Level) %>%                     ## group by to collapse rows
  summarise_at(vars(-(Opportunityid:Level)), funs(max))  ## collapse rows

# A tibble: 5 x 5
# Groups:   Opportunityid [?]
#   Opportunityid Level  Azure ISP   SS
#           <dbl> <chr>  <chr> <chr> <chr>
# 1            10 Low    0     1     1
# 2            20 High   1     1     0
# 3            30 Normal 1     1     0
# 4            40 ""     0     0     1
# 5            50 ""     0     1     0
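The same result can be written with the newer tidyr verbs (a sketch, assuming tidyr >= 1.1, where gather()/spread() are superseded by pivot_longer()/pivot_wider(); the column names slot and has are illustrative):

Products %>%
  pivot_longer(starts_with("Product"),
               names_to = "slot", values_to = "Product") %>%  # long format: one row per product cell
  filter(Product != "") %>%                                   # drop empty cells
  mutate(has = 1L) %>%                                        # presence flag
  pivot_wider(id_cols = c(Opportunityid, Level),
              names_from = Product, names_prefix = "Has",
              values_from = has,
              values_fill = 0L, values_fn = max)              # 0 where a product is absent

Filtering out the empty strings before widening avoids the V1 clean-up step, and values_fn = max collapses the case where the same product appears in both Product columns.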
Answer 1 (score: 0)
A data.table approach, which can benefit from its fast casting and joining capabilities.
Products <- data.frame(
  Opportunityid = c(10, 20, 30, 40, 50),
  Level         = c('Low', 'High', 'Normal', '', ''),
  Product1      = c('SS', 'ISP', 'Azure', 'SS', 'ISP'),
  Product2      = c('ISP', 'Azure', 'ISP', '', ''))
library( data.table )
#create the data.table
dt <- as.data.table( Products )
# first, melt all columns containing "Product"
dt.melt <- melt(dt, id.vars = 1:2, measure.vars = grep( "Product" , names( dt ) ) )
# add an indicator value of 1 (NA where the product cell is empty)
dt.melt[, value2 := ifelse( value == "", NA, 1)]
# now cast to wide format and drop the empty-string ("V1") and id columns
dt.cast <- dcast( dt.melt, Opportunityid ~ value, value.var = "value2")[, c("V1", "Opportunityid") := NULL]
# replace NA with 0
dt.cast[is.na(dt.cast)] <- 0
#and bind
cbind(dt, dt.cast)
#    Opportunityid  Level Product1 Product2 Azure ISP SS
# 1:            10    Low       SS      ISP     0   1  1
# 2:            20   High      ISP    Azure     1   1  0
# 3:            30 Normal    Azure      ISP     1   1  0
# 4:            40               SS              0   0  1
# 5:            50              ISP              0   1  0
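A more compact variant of the same data.table idea (a sketch, assuming the character version of Products built with stringsAsFactors = FALSE as in the first answer; long and wide are illustrative names):

library(data.table)
dt <- as.data.table(Products)

# melt the Product columns and drop empty cells
long <- melt(dt, id.vars = "Opportunityid",
             measure.vars = patterns("^Product"),
             value.name = "Product")[Product != ""]

# one 0/1 column per product: 1 if the opportunity has at least one matching cell
wide <- dcast(long, Opportunityid ~ Product,
              fun.aggregate = function(x) as.integer(length(x) > 0),
              value.var = "Product")

# join the indicator columns back onto the original rows
dt[wide, on = "Opportunityid"]

Because the empty strings are dropped before casting and the aggregate is already 0/1, there is no NA replacement or V1 clean-up step.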
microbenchmark::microbenchmark(
  data.table = {
    # first, melt all columns containing "Product"
    dt.melt <- melt(dt, id.vars = 1:2, measure.vars = grep( "Product" , names( dt ) ) )
    # add an indicator value of 1 (NA where the product cell is empty)
    dt.melt[, value2 := ifelse( value == "", NA, 1)]
    # now cast to wide format and drop the empty-string ("V1") and id columns
    dt.cast <- dcast( dt.melt, Opportunityid ~ value, value.var = "value2")[, c("V1", "Opportunityid") := NULL]
    # replace NA with 0
    dt.cast[is.na(dt.cast)] <- 0
    # and bind
    cbind(dt, dt.cast)
  },
  dplyr = {
    Products %>%
      gather(key, value, Product1:Product2) %>%              ## collect all Product columns
      mutate(has = ifelse(value == '', '', 1)) %>%           ## add a dummy variable
      spread(value, has, fill = 0) %>%                       ## spread the values back in wider format
      select(-key, -V1) %>%                                  ## remove empty columns and former product column
      group_by(Opportunityid, Level) %>%                     ## group by to collapse rows
      summarise_at(vars(-(Opportunityid:Level)), funs(max))  ## collapse rows
  },
  times = 100)
# Unit: milliseconds
#        expr       min        lq      mean    median        uq      max neval
#  data.table  3.159354  3.395846  3.771977  3.598145  3.787187 13.68190   100
#       dplyr 10.104990 10.451142 11.134228 10.694714 10.929098 29.83777   100