Question

我想知道解决此问题的最佳方法是什么。从本质上讲，我想生成20个样本，每个样本的各个数加起来等于100，并且满足 x1 + x2 > 20。我希望找到一种快速高效的做法。我意识到我可以过滤掉不符合此条件的行，但是如果我要生成10,000个而不是20个样本，这样做效率就不高。

代码如下：

# Draw n rows (x1, x2, x3, x4) of integers in 0:100 such that every row sums
# to 100 and x1 + x2 > 20, using rejection sampling on the first three columns.
n <- 20
x1 <- sample(0:100, n, replace = TRUE)
x2 <- sample(0:100, n, replace = TRUE)
x3 <- sample(0:100, n, replace = TRUE)

# index: rows whose first three values already exceed 100 (x4 would be < 0).
# G:     rows that satisfy the extra constraint x1 + x2 > 20.
index <- (x1 + x2 + x3) > 100
G <- (x1 + x2) > 20

# A row has to be redrawn when it violates either constraint. The mask must be
# built with the elementwise operators; `&&` only accepts length-one operands.
redo <- index | !G
while (any(redo)) {
  # Draw exactly one replacement per offending row.
  n_redo <- sum(redo)
  x1[redo] <- sample(0:100, n_redo, replace = TRUE)
  x2[redo] <- sample(0:100, n_redo, replace = TRUE)
  x3[redo] <- sample(0:100, n_redo, replace = TRUE)
  index <- (x1 + x2 + x3) > 100
  G <- (x1 + x2) > 20
  redo <- index | !G
}

# The fourth value is whatever is left to reach 100.
x4 <- 100 - x1 - x2 - x3

df <- data.frame(x1, x2, x3, x4)

谢谢。

Answer 1

另一种可能性：从序列0:100中选择三个断点，然后用这些断点之间的间隔得到x1、x2、x3和x4。如果x1 + x2不大于20，则x3 + x4至少为80（必然大于20），因此我们可以把它们交换。

# Draw one row (x1, x2, x3, x4) that sums to 100 and satisfies x1 + x2 > 20.
#
# Three break points are sampled (with replacement) from seq(from, to) and
# sorted; the four values are the gaps between 0, the breaks, and 100.
#
# Arguments:
#   from, to  Range the break points are sampled from. With the defaults
#             (0 and 100) all four values are non-negative.
# Returns:
#   A one-row data.frame with columns x1, x2, x3, x4.
generate_four_numbers <- function(from = 0, to = 100) {
  breaks <- sort(sample(seq(from, to), 3, replace = TRUE))
  x1 <- breaks[1]
  x2 <- breaks[2] - breaks[1]
  x3 <- breaks[3] - breaks[2]
  x4 <- 100 - breaks[3]

  # If x1 + x2 <= 20 then x3 + x4 >= 80, so mirroring the row makes the first
  # two values satisfy the constraint without any resampling.
  if (x1 + x2 <= 20) {
    return(data.frame(x1 = x4, x2 = x3, x3 = x2, x4 = x1))
  }

  data.frame(x1, x2, x3, x4)
}

# Call the generator 10,000 times and stack the one-row results.
res <- do.call(
  rbind,
  lapply(seq_len(10000), function(i) generate_four_numbers())
)

# Every row total should be 100.
table(rowSums(res))

# Share of rows with x1 + x2 > 20; a value of 1 means all rows are acceptable.
sum(res$x1 + res$x2 > 20) / nrow(res)

Answer 2

这是一种无偏的方法，用于在0:n范围内选取k个和为n的数字。它基于“隔板法”（stars and bars）编码：

# Pick k random non-negative integers (each in 0:n) that sum to n, using the
# stars-and-bars encoding: n stars and k - 1 bars share n + k - 1 slots, and
# the number of stars between consecutive bars is one of the k values.
pick <- function(k, n) {
  n_slots <- n + k - 1
  bar_pos <- sort(sample.int(n_slots, k - 1))
  # Add sentinel bars at 0 and n_slots + 1; each gap between neighbouring
  # bars, minus one, is the count of stars in that segment.
  diff(c(0, bar_pos, n_slots + 1)) - 1
}

这将生成一个样本，并以向量的形式返回。正如@Guillaume Devailly在回答中所观察到的那样，大多数样本都会满足前两个数字之和的附加约束，因此您可以直接过滤掉不符合条件的样本。

请注意，如果您想要4个数字在1：100范围内，这些数字加起来等于100，则可以使用1 + pick(4,96)。

要对前两个数字施加约束：

# Keep drawing four numbers that sum to 100 until the first two add up to
# more than 20, then return that draw.
pick.sample <- function() {
  repeat {
    candidate <- pick(4, 100)
    if (candidate[1] + candidate[2] > 20) {
      return(candidate)
    }
  }
}

然后

# replicate() returns one sample per column, so transpose the result to get
# one sample per row before turning it into a data frame.
df <- data.frame(
  t(replicate(n = 10000, expr = pick.sample()))
)

将创建一个10,000行的数据框，其中每一行都是一个满足约束条件的样本。

Answer 3

要生成一个这样的向量，可以执行以下操作：

# Generate one vector (x1, x2, x3, x4) that sums to 100 with x1 + x2 > 20.

# Draw the total x1 + x2 from 21..100. The upper end must be 100 (not 99):
# x3 = x4 = 0 is a valid outcome, so sample.int() needs 80 possible values.
x1_plus_x2 <- sample.int(80, 1) + 20
# Split that total between x1 and x2 with equal probabilities.
x1x2 <- rmultinom(1, x1_plus_x2, c(1, 1))
# Split the remainder (possibly 0) between x3 and x4 the same way.
x3x4 <- rmultinom(1, 100 - x1_plus_x2, c(1, 1))
# Flatten the two 2 x 1 matrices into the final vector x1, x2, x3, x4.
x <- c(x1x2, x3x4)

您可以循环生成n个样本。您也可以在开始时一次性生成n个 `x1+x2` 的值来提高速度，即把上面 `sample.int()` 调用中的抽样个数 1 换成 n，并设置 `replace = TRUE`。

Answer 4

您可以像下面这样，轻松地用暴力法（brute force）来解决这个问题：

#####
# Brute force solution
# Rejection sampling: draw four independent values from 0:100 and keep the
# draw only when it sums to 100 and its first two values add up to more than
# 20. The seed is fixed so the run is reproducible.
set.seed(28550697)
n <- 100000L
time. <- proc.time() # to measure time difference
# replicate() evaluates the braced expression n times and collects the
# accepted draws as columns; t() turns them into one sample per row.
brute <- t(replicate(
  n, {
    repeat {
      # sample.int(101L, ...) yields values in 1..101, hence the "- 1L"
      # to shift them to 0..100.
      xs <- sample.int(101L, 4, replace = TRUE) - 1L
      # accept the draw only if both constraints hold, otherwise retry
      if(xs[1] + xs[2] > 20L && sum(xs) == 100L)
        break
    }
    xs
  }))
proc.time() - time. # time taken
#R   user  system elapsed 
#R 192.76    0.13  196.74 

# check result: every row sums to 100, every value lies in 0:100, and the
# first two columns add up to more than 20
stopifnot(
  all(rowSums(brute) == 100L),
  all(brute %in% 0:100),
  all(brute[, 1] + brute[, 2] > 20L))

# only the first two columns should be able to take values in 0:100;
# x3 + x4 = 100 - (x1 + x2) is at most 79, which caps columns 3 and 4
apply(brute, 2, range)
#R      [,1] [,2] [,3] [,4]
#R [1,]    0    0    0    0
#R [2,]   99   99   79   79

以上，我在合理的时间内模拟了100,000个样本（比您要求的多10倍）。您当然可以使用更聪明的方法做得更好，但是很显然，这样得到的分布是正确的。

R

4 个答案: