Question

我想要置换一个向量，以便元素在置换之后不能像在原始元素中那样位于相同的位置。让我们说我有一个像这样的元素列表：AABBCCADEF

有效的洗牌是：BBAADEFCCA

但这些都是无效的：B A ACFEDCAB或BCA B FEDCAB

我能找到的最接近的答案是：python shuffle such that position will never repeat。但这并不是我想要的，因为在这个例子中没有重复的元素。

我想要一种快速算法，在重复的情况下概括该答案。

MWE：

library(microbenchmark)

set.seed(1)
x <- sample(letters, size=295, replace=T)

terrible_implementation <- function(x) {
  xnew <- sample(x)
  while(any(x == xnew)) {
    xnew <- sample(x)
  }
  return(xnew)
}

microbenchmark(terrible_implementation(x), times=10)


Unit: milliseconds
                       expr      min       lq    mean  median       uq      max neval
 terrible_implementation(x) 479.5338 2346.002 4738.49 2993.29 4858.254 17005.05    10

另外，如何确定是否可以以这种方式置换序列？

编辑：为了清楚地说明我想要的东西，新的载体应满足以下条件：

1）all(table(newx) == table(x)) 2）all(x != newx)

E.g：

newx <- terrible_implementation(x)
all(table(newx) == table(x))
[1] TRUE
all(x != newx)
[1] TRUE

Answer 1

#DATA
set.seed(1)
x <- sample(letters, size=295, replace=T)

foo = function(S){
    if(max(table(S)) > length(S)/2){
        stop("NOT POSSIBLE")
    }
    U = unique(S)
    done_chrs = character(0)
    inds = integer(0)
    ans = character(0)
    while(!identical(sort(done_chrs), sort(U))){
        my_chrs = U[!U %in% done_chrs]
        next_chr = my_chrs[which.min(sapply(my_chrs, function(x) length(setdiff(which(!S %in% x), inds))))]
        x_inds = which(S %in% next_chr)
        candidates = setdiff(seq_along(S), union(x_inds, inds))
        if (length(candidates) == 1){
            new_inds = candidates
        }else{
            new_inds = sample(candidates, length(x_inds))
        }
        inds = c(inds, new_inds)
        ans[new_inds] = next_chr
        done_chrs = c(done_chrs, next_chr)
    }
    return(ans)
}

ans_foo = foo(x)

identical(sort(ans_foo), sort(x)) & !any(ans_foo == x)
#[1] TRUE

library(microbenchmark)
microbenchmark(foo(x))
#Unit: milliseconds
#   expr      min       lq     mean   median       uq      max neval
# foo(x) 19.49833 22.32517 25.65675 24.85059 27.96838 48.61194   100

Answer 2

我认为这符合您的所有条件。我们的想法是按频率排序，从最常见的元素开始，并将值移动到频率表中的下一个值，即最常见元素出现的次数。这将保证错过所有元素。

我用data.table写的，因为它在调试过程中帮助了我，但没有失去太多的性能。这是性能方面的适度改进。

library(data.table)
library(magrittr)
library(microbenchmark)


permute_avoid_same_position <- function(y) {
  DT <- data.table(orig = y)
  DT[, orig_order := .I]

  count_by_letter <- 
    DT[, .N, keyby = orig] %>%
    .[order(N)] %>%
    .[, stable_order := .I] %>%
    .[order(-stable_order)] %>%
    .[]

  out <- copy(DT)[count_by_letter, .(orig, orig_order, N), on = "orig"]
  # Dummy element
  out[, new := first(y)]
  origs <- out[["orig"]]
  nrow_out <- nrow(out)
  maxN <- count_by_letter[["N"]][1]

  out[seq_len(nrow_out) > maxN, new := head(origs, nrow_out - maxN)]
  out[seq_len(nrow_out) <= maxN, new := tail(origs, maxN)]

  DT[out, j = .(orig_order, orig, new), on = "orig_order"] %>%
    .[order(orig_order)] %>%
    .[["new"]]
}

set.seed(1)
x <- sample(letters, size=295, replace=T)
testthat::expect_true(all(table(permute_avoid_same_position(x)) == table(x)))
testthat::expect_true(all(x != permute_avoid_same_position(x)))
microbenchmark(permute_avoid_same_position(x), times = 5)

# Unit: milliseconds
#                           expr      min       lq     mean   median       uq      max
# permute_avoid_same_position(x) 5.650378 5.771753 5.875116 5.788618 5.938604 6.226228

x <- sample(1:1000, replace = TRUE, size = 1e6)
testthat::expect_true(all(table(permute_avoid_same_position(x)) == table(x)))
testthat::expect_true(all(x != permute_avoid_same_position(x)))

microbenchmark(permute_avoid_same_position(x), times = 5)
# Unit: milliseconds
#                           expr      min       lq    mean   median       uq      max
# permute_avoid_same_position(x) 239.7744 385.4686 401.521 438.2999 440.9746 503.0875

Answer 3

我们可以通过重复元素sample和replicate

的边界提取子字符串

library(stringr)
sapply(replicate(10, sample(str_extract_all(str1, "([[:alpha:]])\\1*")[[1]]),
                simplify = FALSE), paste, collapse="")
#[1] "BBAAEFDCCA" "AAAFBBEDCC" "BBAAAEFCCD" "DFACCBBAAE" "AAFCCBBEAD" 
#[6] "DAAAECCBBF" "AAFCCDBBEA" "CCEFADBBAA" "BBAAEADCCF" "AACCBBDFAE"

数据

str1 <- "AABBCCADEF"

置换矢量，使得元素不能在同一个地方

3 个答案:

数据