我正在开发一个项目,需要根据不同的属性组合反复对data.frame取子集。目前我使用merge函数来取子集:由于在运行之前我并不知道会输入哪些属性,这种方法是可行的。但是,我想知道是否有更快的方法来创建子集。
require(data.table)
# Sample data: 21 observations of four single-letter character attributes.
# Built with data.frame() instead of assembling the attributes by hand;
# stringsAsFactors = FALSE keeps every column as character on any R version.
df <- data.frame(
  att1 = c("e", "a", "c", "a", "d", "e", "a", "d", "b", "a", "c",
           "a", "b", "e", "e", "c", "d", "d", "a", "e", "b"),
  att2 = c("b", "d", "c", "a", "e", "c", "e", "d", "e", "b", "e",
           "e", "c", "e", "a", "a", "e", "c", "b", "b", "d"),
  att3 = c("c", "b", "e", "b", "d", "d", "d", "c", "c", "d", "e",
           "a", "d", "c", "e", "a", "d", "e", "d", "a", "e"),
  att4 = c("c", "a", "b", "a", "e", "c", "a", "a", "b", "a", "a",
           "e", "c", "d", "b", "e", "b", "d", "d", "b", "e"),
  stringsAsFactors = FALSE
)
# Create combinations of attributes.
# For every set of two or more columns of df, collect the distinct value
# combinations that occur in the data, then stack them into one data.table.
# Columns that are not part of a given set are filled with NA.

# Attributes to search through
cnames <- colnames(df)

# Set sizes 2..length(cnames); empty (instead of counting down as 2:1 would)
# when df has fewer than two columns.
combo_sizes <- seq_len(length(cnames))[-1L]

# One character vector of column names per attribute set, ordered by size
# and then in combn() order.
col_sets <- unlist(
  lapply(combo_sizes, function(size) combn(cnames, size, simplify = FALSE)),
  recursive = FALSE
)

# Distinct rows for each attribute set; drop = FALSE guarantees a data.frame.
combo_tables <- lapply(col_sets, function(cols) {
  unique(df[, cols, drop = FALSE])
})

# Bind once at the end rather than growing the table inside the loop, which
# re-copies everything accumulated so far on every iteration.
att_combos <- rbindlist(combo_tables, use.names = TRUE, fill = TRUE)

rm(combo_sizes, col_sets, combo_tables, cnames)
# For each attribute combination, extract the rows of df that match it.
# seq_len() makes the loop a no-op when att_combos has no rows, whereas
# 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(att_combos))) {
  att_sub <- att_combos[i, ]
  # Keep only the attributes that belong to this combination; the remaining
  # columns hold NA for this row.
  present <- names(att_sub)[!vapply(att_sub, is.na, logical(1L))]
  att_sub <- att_sub[, present, with = FALSE]
  # need to subset data.frame here - very slow on large data.frames
  # anyway to speed this up?
  # Inner join on the combination's columns: rows of df carrying exactly
  # these attribute values.
  df_subset_for_analysis <- merge(df, att_sub, by = present)
}
答案 0 :(得分:2)
我会在您想要分组的列上使用 `data.table` 的键(keys),然后(在运行时)用您感兴趣的属性组合生成一个 `data.table`,再将两者 `merge`。
以下是单个属性组合(`simple_combinations`)和多个属性组合(`multiple_combinations`)的示例,输出分别位于 `simple_subset` 和 `multiple_subset`:
require(data.table)
# Sample data: 21 observations of four single-letter character attributes.
# Built with data.frame() instead of assembling the attributes by hand;
# stringsAsFactors = FALSE keeps every column as character on any R version.
df <- data.frame(
  att1 = c("e", "a", "c", "a", "d", "e", "a", "d", "b", "a", "c",
           "a", "b", "e", "e", "c", "d", "d", "a", "e", "b"),
  att2 = c("b", "d", "c", "a", "e", "c", "e", "d", "e", "b", "e",
           "e", "c", "e", "a", "a", "e", "c", "b", "b", "d"),
  att3 = c("c", "b", "e", "b", "d", "d", "d", "c", "c", "d", "e",
           "a", "d", "c", "e", "a", "d", "e", "d", "a", "e"),
  att4 = c("c", "a", "b", "a", "e", "c", "a", "a", "b", "a", "a",
           "e", "c", "d", "b", "e", "b", "d", "d", "b", "e"),
  stringsAsFactors = FALSE
)
# Turn df into a data.table and, in the same call, key it on the four
# attribute columns that the lookups below match on.
dt <- data.table(df, key = c("att1", "att2", "att3", "att4"))
# A single attribute combination to look up, keyed on all four attributes
# at construction time.
simple_combinations <- data.table(
  att1 = "d", att2 = "e", att3 = "d", att4 = "e",
  key = c("att1", "att2", "att3", "att4")
)
# Rows of dt whose attributes match the combination above
simple_subset <- merge(x = dt, y = simple_combinations)
# Complex (multiple) sets of attributes: every pairing of the listed att2 and
# att4 values with att1 = "d" and att3 = "d".
# stringsAsFactors = FALSE keeps the columns as character, matching the
# column types of dt; expand.grid() would otherwise return factors and the
# join below would compare factor keys against character keys.
multiple_combinations <- data.table(
  expand.grid(
    att1 = "d",
    att2 = c("c", "d", "e"),
    att3 = "d",
    att4 = c("b", "e"),
    stringsAsFactors = FALSE
  )
)
setkey(multiple_combinations, att1, att2, att3, att4)
# Merge to generate output subset (multiple_combinations of att present in dt)
multiple_subset <- merge(dt, multiple_combinations)
输出位于 `simple_subset` 和 `multiple_subset`。