我的数据集如下所示: 我的目标是通过组合当前变量来创建许多属性。例如,我想创建一个名为 HS_POC 的属性,当我们在数据集中拥有相同应用程序且应用程序状态为H或S并且邮政编码完全相同,否则为NO时,此属性显示Yes。这样做的代码是:
sub <-subset(
data,
status %in% c("H","S"),
select = c(
status,
applications,
postalcode))
attributes <-data.frame(setDT(sub)[, .(.N, applications) , by =
.(
status,
postalcode
)][N > 1])
attributes <- subset(attributes,
select = c(applications, N))
attributes$N <- "Yes"
colnames(attributes)[colnames(attributes) == "N"] <- "HS_POC"
train <-
merge(data, attributes, by = "applications", all.x =
TRUE)
data$HS_POC <- data$HS_POC %>% replace_na("No")
结果是像这样的新列
。
我想做的是创建许多列,例如 HS_POC 和其他变量的组合,例如 HS_生日, HS_产品类型, HS_名字,然后...我为此创建了一个循环,但是它很慢,如何从速度上改善它。
library(data.table)
data<-read.csv("test.csv")
Indep.var<-subset(data, select=-c(applications,Status))
aa<- colnames(Indep.var)
table<- t(combn(aa,2))
table<-data.frame(table)
table$Status<-c("Status")
table_status<- table
table_status<- as.matrix(table_status)
table$applications<-c("applications")
table_final<- as.matrix(table)
table_final[1,]
##paste(table_final[1,1],table_final[1,2],sep = "_")
##nrow(table_final)
N<-nrow(table_final)
for (i in 1:N)
{
sub <- subset(data,
Status %in% c("H", "S"),
select = table_final[i, ])
attributes <- data.frame(setDT(sub)[, .(.N, applications) , by =
eval(table_status[i,])][N > 1])
attributes <- subset(attributes,
select = c(applications, N))
if (nrow(attributes)== 0){
colnames(attributes)[colnames(attributes) == "N"] <- paste("HS", table_final[i, 1], table_final[i, 2], sep = "_")
data <- merge(data, attributes, by = "applications", all.x =
TRUE)
}else{
attributes$N <- "Yes"
colnames(attributes)[colnames(attributes) == "N"] <- paste("HS", table_final[i, 1], table_final[i, 2], sep = "_")
data <- merge(data, attributes, by = "applications", all.x =
TRUE)
}
}
library(tidyr)
data %>% replace_na("No")
data[is.na(data)] <- "No"
预先感谢您的帮助。