R-创建多个方案组合

时间:2019-01-09 23:34:09

标签: r combinations

我有四个(部分重叠)的小组,每个小组有八个独特的申请人,他们分别申请了我必须分配的工作的20%,30%,40%和50%:

g20 <- c("a","b","c","d","e","f")
g30 <- c("a","b","c","d","e","f","g","h")
g40 <- c("c","d","e","f","g","h")
g50 <- c("e","f","g","h")

因为我只能按这四个增量来授予作品,并且我必须选择不少于两个人且不超过四个人,所以我有六个方案可以授予100%的作品:

  1. 50/50
  2. 50/30/20
  3. 40/40/20
  4. 40/30/30
  5. 40/20/20/20
  6. 30/30/20/20

对于每种情况,我需要找到所有可能的组合(无替换),以将作品授予相应组中的申请人。

我可以轻松地使用t(combn(g50,2))在第一种情况下完成此操作,但是我不确定如何处理其他不得不从不同向量中提取组合并确保在任何给定条件下仅选择一次申请人的其他情况组合。输出必须是实际的组合,而不仅仅是组合的数量。

使用R,我如何从四个不同的组中获取这些组合,并(以方案5为例)确保将“ cdef”,“ cedf”,“ cfed”,“ cfde”等都视为同样的结果?

这可能吗?

3 个答案:

答案 0 :(得分:2)

还创建了所有可能的组合,例如乔恩·斯普林(Jon Spring)的解决方案,但使用了软件包并删除了欺诈申请人。

如果您的实际尺寸是每个OP的价格,则可以考虑扩展到所有可能的组合,并删除申请人重复的行:

library(data.table)

g20 <- c("a","b","c","d","e","f")
g30 <- c("a","b","c","d","e","f","g","h")
g40 <- c("c","d","e","f","g","h")
g50 <- c("e","f","g","h")

scen <- paste0("g", c(30, 30, 20, 20))
allcombi <- do.call(CJ, mget(scen))
setnames(allcombi, paste0("V", 1L:length(allcombi)))

#remove rows with applicants that are repeated in different columns
nodupe <- allcombi[
    allcombi[, .I[anyDuplicated(unlist(.SD)) == 0L], 
        by=1:allcombi[,.N]]$V1]

#sort within columns with the same percentage of work
for(cols in split(names(nodupe), scen))
    nodupe[, (cols) := sort(.SD), by=seq_len(nodupe[,.N]), .SDcols=cols]

#remove identical combinations
ans <- unique(nodupe)
setnames(ans, scen)[]

输出:

     g30 g30 g20 g20
  1:   a   b   c   d
  2:   a   b   c   e
  3:   a   b   c   f
  4:   a   b   d   e
  5:   a   b   d   f
 ---                
221:   g   h   c   e
222:   g   h   c   f
223:   g   h   d   e
224:   g   h   d   f
225:   g   h   e   f

在所有6种情况下运行的代码和结果:

scenarios <- list(c(50,50), 
    c(50,30,20), 
    c(40,40,20), 
    c(40,30,30), 
    c(40,20,20,20), 
    c(30,30,20,20))

lapply(scenarios, 
    function(scen) {
        scen <- paste0("g", scen)
        allcombi <- do.call(CJ, mget(scen, envir=.GlobalEnv))
        setnames(allcombi, paste0("V", 1L:length(allcombi)))

        nodupe <- allcombi[
            allcombi[, .I[anyDuplicated(unlist(.SD)) == 0L], 
                by=1:allcombi[,.N]]$V1]

        for(cols in split(names(nodupe), scen))
            nodupe[, (cols) := sort(.SD), by=seq_len(nodupe[,.N]), .SDcols=cols]

        ans <- unique(nodupe)
        setnames(ans, scen)[]
})

输出:

[[1]]
   g50 g50
1:   e   f
2:   e   g
3:   e   h
4:   f   g
5:   f   h
6:   g   h

[[2]]
     g50 g30 g20
  1:   e   a   b
  2:   e   a   c
  3:   e   a   d
  4:   e   a   f
  5:   e   b   a
 ---            
128:   h   g   b
129:   h   g   c
130:   h   g   d
131:   h   g   e
132:   h   g   f

[[3]]
    g40 g40 g20
 1:   c   d   a
 2:   c   d   b
 3:   c   d   e
 4:   c   d   f
 5:   c   e   a
 6:   c   e   b
 7:   c   e   d
 8:   c   e   f
 9:   c   f   a
10:   c   f   b
11:   c   f   d
12:   c   f   e
13:   c   g   a
14:   c   g   b
15:   c   g   d
16:   c   g   e
17:   c   g   f
18:   c   h   a
19:   c   h   b
20:   c   h   d
21:   c   h   e
22:   c   h   f
23:   d   e   a
24:   d   e   b
25:   d   e   c
26:   d   e   f
27:   d   f   a
28:   d   f   b
29:   d   f   c
30:   d   f   e
31:   d   g   a
32:   d   g   b
33:   d   g   c
34:   d   g   e
35:   d   g   f
36:   d   h   a
37:   d   h   b
38:   d   h   c
39:   d   h   e
40:   d   h   f
41:   e   f   a
42:   e   f   b
43:   e   f   c
44:   e   f   d
45:   e   g   a
46:   e   g   b
47:   e   g   c
48:   e   g   d
49:   e   g   f
50:   e   h   a
51:   e   h   b
52:   e   h   c
53:   e   h   d
54:   e   h   f
55:   f   g   a
56:   f   g   b
57:   f   g   c
58:   f   g   d
59:   f   g   e
60:   f   h   a
61:   f   h   b
62:   f   h   c
63:   f   h   d
64:   f   h   e
65:   g   h   a
66:   g   h   b
67:   g   h   c
68:   g   h   d
69:   g   h   e
70:   g   h   f
    g40 g40 g20

[[4]]
     g40 g30 g30
  1:   c   a   b
  2:   c   a   d
  3:   c   a   e
  4:   c   a   f
  5:   c   a   g
 ---            
122:   h   d   f
123:   h   d   g
124:   h   e   f
125:   h   e   g
126:   h   f   g

[[5]]
    g40 g20 g20 g20
 1:   c   a   b   d
 2:   c   a   b   e
 3:   c   a   b   f
 4:   c   a   d   e
 5:   c   a   d   f
 6:   c   a   e   f
 7:   c   b   d   e
 8:   c   b   d   f
 9:   c   b   e   f
10:   c   d   e   f
11:   d   a   b   c
12:   d   a   b   e
13:   d   a   b   f
14:   d   a   c   e
15:   d   a   c   f
16:   d   a   e   f
17:   d   b   c   e
18:   d   b   c   f
19:   d   b   e   f
20:   d   c   e   f
21:   e   a   b   c
22:   e   a   b   d
23:   e   a   b   f
24:   e   a   c   d
25:   e   a   c   f
26:   e   a   d   f
27:   e   b   c   d
28:   e   b   c   f
29:   e   b   d   f
30:   e   c   d   f
31:   f   a   b   c
32:   f   a   b   d
33:   f   a   b   e
34:   f   a   c   d
35:   f   a   c   e
36:   f   a   d   e
37:   f   b   c   d
38:   f   b   c   e
39:   f   b   d   e
40:   f   c   d   e
41:   g   a   b   c
42:   g   a   b   d
43:   g   a   b   e
44:   g   a   b   f
45:   g   a   c   d
46:   g   a   c   e
47:   g   a   c   f
48:   g   a   d   e
49:   g   a   d   f
50:   g   a   e   f
51:   g   b   c   d
52:   g   b   c   e
53:   g   b   c   f
54:   g   b   d   e
55:   g   b   d   f
56:   g   b   e   f
57:   g   c   d   e
58:   g   c   d   f
59:   g   c   e   f
60:   g   d   e   f
61:   h   a   b   c
62:   h   a   b   d
63:   h   a   b   e
64:   h   a   b   f
65:   h   a   c   d
66:   h   a   c   e
67:   h   a   c   f
68:   h   a   d   e
69:   h   a   d   f
70:   h   a   e   f
71:   h   b   c   d
72:   h   b   c   e
73:   h   b   c   f
74:   h   b   d   e
75:   h   b   d   f
76:   h   b   e   f
77:   h   c   d   e
78:   h   c   d   f
79:   h   c   e   f
80:   h   d   e   f
    g40 g20 g20 g20

[[6]]
     g30 g30 g20 g20
  1:   a   b   c   d
  2:   a   b   c   e
  3:   a   b   c   f
  4:   a   b   d   e
  5:   a   b   d   f
 ---                
221:   g   h   c   e
222:   g   h   c   f
223:   g   h   d   e
224:   g   h   d   f
225:   g   h   e   f

答案 1 :(得分:1)

EDIT-根据对OP的仔细阅读更新了我的回复。现在,可以确定可以组成多少个不同的团队,而不必考虑如何在其中分配工作。

是的!这绝不是最优雅或最有效的解决方案,但有可能。此数据大约需要1秒钟,但是如果您拥有更复杂的真实数据,则速度会变慢。

首先,我为每个申请人确定了可能性。我认为以这种方式进行布局更直观,因为我们需要为每个申请人进行一次分配(包括零可能性)。

a <- c(0, 20, 30)
b <- c(0, 20, 30)
c <- c(0, 20, 30, 40)
d <- c(0, 20, 30, 40)
e <- c(0, 20, 30, 40, 50)
f <- c(0, 20, 30, 40, 50)
g <- c(0,     30, 40, 50)
h <- c(0,     30, 40, 50)

然后,我列举了使用expand.grid分配工作的所有可能性,然后进行过滤以仅包括完成100%工作的工作。

library(tidyverse)
soln_with_permutations <- expand.grid(a,b,c,d,e,f,g,h) %>%
  # the Applicants come in as Var1, Var2... here, will rename below
  as.tibble() %>%
  rownames_to_column() %>% # This number tracks each row / potential solution

  # gather into long format to make summing simpler
  gather(applicant, assignment, -rowname) %>%
  # rename Var1 as "a", Var2 as "b", and so on.
  mutate(applicant = str_sub(applicant, start = -1) %>% as.integer %>% letters[.]) %>%

  group_by(rowname) %>%
  # keep only solutions adding to 100%
  filter(sum(assignment) == 100) %>%
  # keep only solutions involving four or fewer applicants
  filter(sum(assignment > 0) <= 4) %>%
  ungroup()

每个rowname都按照申请人之间如何分配作品来描述一种独特的解决方案,但是许多排列方式是在同一团队之间分配作品的方式不同。为了查看组成了多少个不同的团队,以及该团队可以使用多少个不同的方案,我分别用团队(按字母顺序标记)和方案(按降序标记)标记每个解决方案。

soln_distinct_teams <- soln_with_permutations %>%
  filter(assignment > 0) %>%
  group_by(rowname) %>%
  # Get team composition, alphabetical
  mutate(team = paste0(applicant, collapse = "")) %>%
  # Get allocation structure, descending
  arrange(-assignment) %>%
  mutate(allocation = paste0(assignment, collapse = "/")) %>%
  ungroup() %>%

  # Distinct teams / allocations only
  distinct(team, allocation) %>%
  arrange(allocation, team) %>%
  mutate(soln_num = row_number()) %>%

  # select(soln_num, team, allocation) %>%
  spread(allocation, soln_num)

每一行显示了可以创建的2-4个申请人的132个不同团队之一,并且在各列中,我们看到了至少可以在一个排列中应用到该团队的不同方案。

# A tibble: 132 x 7
   team  `30/30/20/20` `40/20/20/20` `40/30/30` `40/40/20` `50/30/20` `50/50`
   <chr>         <int>         <int>      <int>      <int>      <int>   <int>
 1 abc              NA            NA        126         NA         NA      NA
 2 abcd              1            71         NA         NA         NA      NA
 3 abce              2            72         NA         NA         NA      NA
 4 abcf              3            73         NA         NA         NA      NA
 5 abcg              4            74         NA         NA         NA      NA
 6 abch              5            75         NA         NA         NA      NA
 7 abd              NA            NA        127         NA         NA      NA
 8 abde              6            76         NA         NA         NA      NA
 9 abdf              7            77         NA         NA         NA      NA
10 abdg              8            78         NA         NA         NA      NA
# ... with 122 more rows

答案 2 :(得分:0)

感谢您对此的所有帮助! chinsoon12的解决方案对我来说是最有用的。如前所述,该解决方案仍返回一些重复项(在40/40/20或40/30/30方案中,它没有删除重复率在该方案中出现两次的重复项)。

虽然可能不是最优雅的解决方案,但我修改了chinsoon12的解决方案。以40/40/20为例,我首先创建了40/40的所有可能组合,然后创建了40/40和20的组合。然后,我能够准确地删除重复项。

# Create 40/40 combos
combs_40 <- t(combn(g40,2))
c40 <- paste0(combs_40[,1],combs_40[,2])

# Create combos of 40/40 and 20
scen <- c("c40","g20")
allcombi <- do.call(CJ, mget(scen, envir=.GlobalEnv))
allcombi <- as.data.frame(allcombi)

# Split into cols
x <- t(as.data.frame(strsplit(allcombi$c40,split="")))
allcombi <- as.data.table(cbind(x[,1],x[,2],allcombi$g20))
setnames(allcombi, paste0("V", 1L:length(allcombi)))

# Remove rows with applicants that are repeated in different columns
nodupe <- allcombi[
  allcombi[, .I[anyDuplicated(unlist(.SD)) == 0L], 
           by=1:allcombi[,.N]]$V1]
# Redefine scen
scen <- c("g40","g40","g20")

# Sort within columns with the same percentage of work
for(cols in split(names(nodupe), scen))
  nodupe[, (cols) := sort(.SD), by=seq_len(nodupe[,.N]), .SDcols=cols]

# Set names, write results
setnames(nodupe, scen)[]
results_404020 <- nodupe