我有一个 data.frame,其中包含客户ID以及三个变量的不同取值组合。第一步是找出最常见的组合,我用下面的代码实现了这一点。
# Simulated example data: `n` rows, each holding a customer ID and three
# categorical variables (s1, s2, s3) drawn from `possible_s`.
# All draws are random and no seed is set here, so the data (and every count
# derived from it) differs between runs.
possible_s <- c("a", "b", "c", "d", "e")
n <- 10000
df <- tibble(
  # IDs are sampled with replacement, so one customer can occur in several rows.
  customer_id = sample(1:10000, n, replace = TRUE),
  s1 = sample(possible_s, n, replace = TRUE),
  s2 = sample(possible_s, n, replace = TRUE),
  s3 = sample(possible_s, n, replace = TRUE)
)
# Count how often each unordered combination of (s1, s2, s3) occurs.
# The three letters of a row are sorted before being pasted together, so
# rows such as (c, a, b) and (a, b, c) share the same key "abc".
combinations <- table(
  apply(df[, 2:4], 1, function(x) paste0(sort(x), collapse = ""))
) %>%
  as_tibble() %>%
  arrange(desc(n)) # most frequent combination first

# Split the three-letter key (first column) back into its single letters.
# substr() is vectorised, so it is applied to the key vector directly.
# Running sapply() over the one-column tibble `combinations[, 1]` iterates
# over its columns and produces a one-column matrix instead of a plain
# character vector.
combinations <- combinations %>%
  mutate(
    s1 = substr(combinations[[1]], 1, 1),
    s2 = substr(combinations[[1]], 2, 2),
    s3 = substr(combinations[[1]], 3, 3)
  )
现在,我想输出最常见的20种组合所对应的客户ID。下面是针对第一种组合手动完成该操作的做法:
# Show the first row of `combinations`, i.e. the most frequent combination
# (the table was sorted by descending count above).
combinations[1, ]
# A tibble: 1 x 5
Var1 n s1 s2 s3
<chr> <int> <chr> <chr> <chr>
1 cde 503 c d e
# Distinct customers whose row consists of exactly one "c", one "d" and one
# "e" (in any order) -- the manual lookup for the combination "cde".
# Only the three variable columns are counted. Running apply() over the whole
# tibble would coerce it to a character matrix and also scan customer_id.
df %>%
  mutate(
    count_c = str_count(s1, "c") + str_count(s2, "c") + str_count(s3, "c"),
    count_d = str_count(s1, "d") + str_count(s2, "d") + str_count(s3, "d"),
    count_e = str_count(s1, "e") + str_count(s2, "e") + str_count(s3, "e")
  ) %>%
  filter(count_c == 1, count_d == 1, count_e == 1) %>%
  distinct(customer_id)
我尝试用下面的代码对前20个组合执行同样的操作:
# For each of the 20 most frequent combinations, collect the distinct
# customer IDs whose row holds exactly that combination of letters.
# Returns a list with one tibble of customer_id values per combination.
#
# t() turns every combination into a column, so iterating over the columns of
# the resulting data frame hands the three letters of one combination to the
# function as `y`.
lapply(
  as.data.frame(t(combinations[1:20, 3:5]), stringsAsFactors = FALSE),
  function(y) {
    # Number of times `letter` occurs among s1, s2 and s3 of each row.
    occurrences <- function(letter) {
      str_count(df$s1, letter) +
        str_count(df$s2, letter) +
        str_count(df$s3, letter)
    }
    df %>%
      mutate(
        vfg1 = occurrences(y[1]),
        vfg2 = occurrences(y[2]),
        vfg3 = occurrences(y[3])
      ) %>%
      # Filtering happens per combination, inside the function. Each letter
      # has to occur as often as it occurs in the combination itself, so
      # combinations with a repeated letter (e.g. "aab") are matched too;
      # a fixed `== 1` would never match them.
      filter(
        vfg1 == sum(y == y[1]),
        vfg2 == sum(y == y[2]),
        vfg3 == sum(y == y[3])
      ) %>%
      distinct(customer_id)
  }
)
不幸的是,这并没有得到预期的结果。为了把各个组合对应的ID合并在一起,我想到了类似下面的写法:
# Stack the distinct customer IDs of the most frequent combinations into a
# single tibble (one block of rows per combination).
# NOTE(review): the surrounding text talks about the top 20 combinations,
# while this snippet uses the first 10 rows -- confirm which one is intended.
bind_rows(
  lapply(
    as.data.frame(t(combinations[1:10, 3:5]), stringsAsFactors = FALSE),
    function(y) {
      # Number of times `letter` occurs among s1, s2 and s3 of each row.
      occurrences <- function(letter) {
        str_count(df$s1, letter) +
          str_count(df$s2, letter) +
          str_count(df$s3, letter)
      }
      df %>%
        mutate(
          vfg1 = occurrences(y[1]),
          vfg2 = occurrences(y[2]),
          vfg3 = occurrences(y[3])
        ) %>%
        # Each letter has to occur as often as it occurs in the combination
        # itself, which also covers combinations with a repeated letter.
        filter(
          vfg1 == sum(y == y[1]),
          vfg2 == sum(y == y[2]),
          vfg3 == sum(y == y[3])
        ) %>%
        distinct(customer_id)
    }
  )
)
除了将前20个组合的所有ID保存在一个 tibble 中之外,我还想将每个组合的ID保存在一个列表中,其中每个组合及其对应的ID都是列表的一个元素。
答案 0 :(得分:0)
尽管这可能不是最高效的方法,但这是一个可行的解决方案:
# Letters (s1, s2, s3) of the 20 most frequent combinations.
top20_combinations <- combinations %>% slice(1:20) %>% select(3:5)

# Sorted-letter key of every row in `df`, computed once. It is built the same
# way as the keys that were counted to obtain `combinations`.
row_keys <- apply(df[, 2:4], 1, function(x) paste0(sort(x), collapse = ""))

# The same kind of key for each of the top combinations.
target_keys <- unname(
  apply(top20_combinations, 1, function(x) paste0(sort(x), collapse = ""))
)

# One list element per combination, holding the distinct customer IDs whose
# row matches that combination. Comparing keys also works for combinations
# with a repeated letter (e.g. "aab"); requiring every letter count to equal 1
# returns nothing for those.
customer_ids <- lapply(
  target_keys,
  function(key) unique(df$customer_id[row_keys == key])
)
# Name each element after its combination so the IDs can be looked up by key
# as well as by position.
names(customer_ids) <- target_keys