Question

有没有一种快速、可扩展的方法，将整数 1 到 N 转换为相应的、长度相同（即同样含 N 个元素）的字符串序列 "A", "B", …, "Z", "AA", "AB", …？

或者，我也可以接受将整数向量映射为字符向量，使得字符向量的每个元素具有相同的字符数。例如：1, 2, …, 27 => "AA", "AB", …, "AZ", "BA"

示例输入：

num_vec <- seq(1, 1000)
char_vec <- ???

更新

我的做法比较取巧（hackish），但这是我目前最好的可行尝试：

library(data.table)
#' First `n` fixed-width letter codes ("A".."Z", or "AA", "AB", ...)
#'
#' Every element of the result has the same number of characters: the
#' smallest width `k` such that `26^k >= n`. Codes are in lexicographic
#' order, with the last letter varying fastest.
#'
#' The codes are computed directly from base-26 arithmetic on the indices,
#' so only `n` strings are ever built and no external package is needed.
#'
#' @param n A single non-missing number: how many codes to return.
#'   Fractional values are rounded down.
#' @return A character vector of length `n`; `character(0)` when `n < 1`.
myfunc <- function(n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n))
  n <- floor(n)
  if (n < 1) {
    return(character(0))
  }

  # Smallest width whose 26^width codes cover n. An integer loop is used
  # instead of log(n, 26) so exact powers of 26 are not hit by rounding error.
  width <- 1L
  while (26^width < n) {
    width <- width + 1L
  }

  # Zero-based index of each code; its base-26 digits are the letters.
  idx <- seq_len(n) - 1
  letters_by_position <- lapply(rev(seq_len(width)) - 1L, function(p) {
    LETTERS[(idx %/% 26^p) %% 26 + 1]
  })
  do.call(paste0, letters_by_position)
}

myfunc(10)

Answer 1

评论中已经发布了几个不错的解决方案。其中只有 @Gregor 在此处发布的方案目前给出了 Ben 首选的结果。

然而，@eddi、@DavidArenburg 和 @G.Grothendieck 发布的方法也可以稍作调整，以得到首选的结果：

# adaptation of @eddi's method:
library(data.table)
n <- 29
# Number of letter positions to generate. 26^sz >= n guarantees at least n
# distinct names; max() keeps n = 1 (where the log is 0) from asking for
# zero columns.
sz <- max(1, ceiling(log(n) / log(26)))
# Cross-join sz copies of c("", LETTERS); "" stands for an unused position.
# Row 1 is the all-empty combination and is dropped. unique() removes names
# that can be reached through more than one placement of "".
do.call(CJ, replicate(sz, c("", LETTERS), simplify = FALSE))[
  -1, unique(Reduce(paste0, .SD))
][seq_len(n)]

# adaptation of @DavidArenburg's method:
n <- 29
# Largest string width needed: the smallest sz for which
# 26 + 26^2 + ... + 26^sz covers n.
sz <- 1
while (sum(26^seq_len(sz)) < n) {
  sz <- sz + 1
}
# Build every name of width 1, then width 2, ... and concatenate them, so the
# sequence runs "A".."Z", "AA".."ZZ", "AAA", ... for any n. The number of CJ
# columns is the string width, not the number of 26-element blocks in n.
blocks <- lapply(seq_len(sz), function(k) {
  do.call(CJ, replicate(k, LETTERS, simplify = FALSE))[, do.call(paste0, .SD)]
})
unlist(blocks)[seq_len(n)]

# adaptation of @G.Grothendieck's method:
n <- 29
# Largest string width needed: the smallest sz for which
# 26 + 26^2 + ... + 26^sz covers n.
sz <- 1
while (sum(26^seq_len(sz)) < n) {
  sz <- sz + 1
}
# One expand.grid() per width k, with k separate LETTERS columns (repeating
# the alphabet inside a single column does not add a letter position).
# expand.grid() varies its first column fastest, so the columns are reversed
# before pasting to make the last letter the fastest-changing one.
by_width <- lapply(seq_len(sz), function(k) {
  g <- expand.grid(rep(list(LETTERS), k), stringsAsFactors = FALSE)
  do.call(paste0, rev(g))
})
res <- unlist(by_width)[seq_len(n)]
res

这三个结果都是：

 [1] "A"  "B"  "C"  "D"  "E"  "F"  "G"  "H"  "I"  "J"  "K"  "L"  "M"  "N"  "O" 
[16] "P"  "Q"  "R"  "S"  "T"  "U"  "V"  "W"  "X"  "Y"  "Z"  "AA" "AB" "AC"

Answer 2

这个问题似乎非常适合用 Rcpp 来解决。下面是一个非常简单的做法：

// [[Rcpp::export]]
StringVector combVec(CharacterVector x, CharacterVector y) {
    int nx = x.size();
    int ny = y.size();
    CharacterVector z(nx*ny);
    int k = 0;
    for (int i = 0; i < nx; i++) {
        for (int j = 0; j < ny; j++) {
            z[k] = x[i];
            z[k] += y[j];
            k++;
        }
    }
    return z;  
}

#' First `n` fixed-width letter codes, built with `combVec()`
#'
#' The width is the smallest `k` with `26^k >= n`, so `n <= 26` yields
#' single letters, `n <= 676` yields two-letter codes, and so on. All `26^k`
#' codes of that width are generated by repeatedly crossing with `LETTERS`
#' and the first `n` are returned.
#'
#' @param n Number of codes to return.
#' @return A character vector of length `n`; `character(0)` when `n < 1`.
NumChar <- function(n) {
    if (n < 1) {
        return(character(0))
    }
    # Integer loop instead of trunc(log(n, 26)): the log form is off by one at
    # exact powers of 26 and gives 0 for n < 26.
    width <- 1L
    while (26^width < n) {
        width <- width + 1L
    }
    ch <- LETTERS
    # seq_len() makes this a no-op for width 1; `0:1` would iterate twice.
    for (i in seq_len(width - 1L)) {
        ch <- combVec(ch, LETTERS)
    }
    ch[seq_len(n)]
}

结果与提问者（OP）自己给出的方案完全一致。

library(data.table)
Rcpp::sourceCpp('combVec.cpp')

identical(myfunc(100000), NumChar(100000))
[1] TRUE 

head(NumChar(100000))
[1] "AAAA" "AAAB" "AAAC" "AAAD" "AAAE" "AAAF"
tail(NumChar(100000))
[1] "FRXY" "FRXZ" "FRYA" "FRYB" "FRYC" "FRYD"

更新了基准测试，加入了 @eddi 出色的 Rcpp 实现：

library(microbenchmark)

microbenchmark(myfunc(10000), funEddi(10000), NumChar(10000), excelCols(10000, LETTERS))
Unit: microseconds
                     expr       min        lq       mean     median        uq       max neval  cld
            myfunc(10000)  6632.125  7255.454  8441.7770  7912.4780  9283.660 14184.971   100   c 
           funEddi(10000) 12012.673 12869.928 15296.3838 13870.7050 16425.907 80443.142   100    d
           NumChar(10000)  2592.555  2883.394  3326.9292  3167.4995  3574.300  6051.273   100  b  
excelCols(10000, LETTERS)   636.165   656.820   782.7679   716.9225   811.148  1386.673   100 a 

microbenchmark(myfunc(100000), funEddi(100000), NumChar(100000), excelCols(100000, LETTERS), times = 10)
Unit: milliseconds
                     expr        min         lq       mean    median        uq       max neval  cld
            myfunc(1e+05) 203.992591 210.049303 255.049395 220.74955 262.52141 397.03521    10   c 
           funEddi(1e+05) 523.934475 530.646483 563.853995 552.83903 577.88915 688.84714    10    d
           NumChar(1e+05)  82.216802  83.546577  97.615537  93.63809 112.14316 115.84911    10  b  
excelCols(1e+05, LETTERS)   7.480882   8.377266   9.562554   8.93254  11.10519  14.11631    10 a

正如 @DirkEddelbuettel 所说："Rcpp 不是什么神奇的小马……"。这些效率差异恰恰说明，尽管 Rcpp 及其相关的包都非常出色，但它们并不能修复糟糕的代码。感谢 @eddi 发布了正确的 Rcpp 实现。

Answer 3

这是一个快速Rcpp解决方案，比原生R解决方案快几个数量级：

# Compiles the C++ source below with cppFunction() and defines
# excelCols(n, x), which returns a character vector of length n.
# NOTE(review): cppFunction() is presumably Rcpp's; no library(Rcpp) call is
# visible in this snippet -- confirm it is attached before running.
#
# How the C++ works:
#   * `base` holds the current label; `baseN[j]` is the 1-based index into x
#     of the symbol at position j of `base`.
#   * Each outer iteration advances the label by one, like an odometer: it
#     walks positions from last to first. A position already at the last
#     symbol (baseN[j] == sz) wraps to x[0] and the carry continues to the
#     left; otherwise the position is incremented and the walk stops.
#   * If no position could be incremented (the label is empty or every
#     position wrapped), one more position is appended, set to x[0].
#   * The updated label is stored in res[i].
#
# Limits visible in the code:
#   * baseN has a fixed capacity of 100 positions and is not bounds-checked.
#   * Existing positions are overwritten with only the first character of the
#     chosen element of x, while a newly appended position adds x[0] itself.
#     NOTE(review): this looks like it assumes single-character symbols such
#     as LETTERS -- confirm.
#   * x[0] is read without checking that x is non-empty.
cppFunction('CharacterVector excelCols(int n, CharacterVector x) {
  CharacterVector res(n);
  int sz = x.size();
  std::string base;
  int baseN[100] = {0}; // being lazy about size here - you will never grow larger than this
  for (int i = 0; i < n; ++i) {
    bool incr = false;
    for (int j = base.size() - 1; j >= 0 && !incr; --j) {
      if (baseN[j] == sz) {
        baseN[j] = 1;
        base[j] = as<std::string>(x[0])[0];
      } else {
        baseN[j] += 1;
        base[j] = as<std::string>(x[baseN[j] - 1])[0];
        incr = true;
      }
    }
    if (!incr) {
      baseN[base.size()] = 1;
      base += x[0];
    }
    res[i] = base;
  }
  return res;
}')

excelCols(100, LETTERS)

将整数序列 1, 2, 3, … 转换为相应的字符串序列 A, B, C, …

3 个答案: