Function to return all factors of an integer in R

Asked: 2011-06-21 11:58:47

Tags: r factorization

My usual search-fu is failing me. I'm trying to find an R function that returns ALL of the factors of an integer. There are at least 2 packages with a factorize() function: gmp and conf.design, however these functions return only the prime factors. I'd like a function that returns all factors.

Obviously, searching for this is difficult, since R has a construct called factors, which puts a lot of noise in the search.

5 answers:

Answer 0 (score: 19)

To follow up on my comment (thanks to @Ramnath for catching my typo), the brute-force method seems to work reasonably well on my 64-bit, 8-gig machine:

FUN <- function(x) {
    x <- as.integer(x)
    div <- seq_len(abs(x))
    factors <- div[x %% div == 0L]
    factors <- list(neg = -factors, pos = factors)
    return(factors)
}

Some examples:

> FUN(100)
$neg
[1]   -1   -2   -4   -5  -10  -20  -25  -50 -100

$pos
[1]   1   2   4   5  10  20  25  50 100

> FUN(-42)
$neg
[1]  -1  -2  -3  -6  -7 -14 -21 -42

$pos
[1]  1  2  3  6  7 14 21 42

# and a big number

> system.time(FUN(1e8))
   user  system elapsed 
   1.95    0.18    2.14 
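A common refinement of this brute-force idea (a sketch of my own, not part of the answer above, with the hypothetical name factors_sqrt) is to test candidate divisors only up to sqrt(x) and recover the rest as the complements x / d, which reduces the work from x to roughly sqrt(x) modulo operations:

factors_sqrt <- function(x) {
    # minimal sketch, assuming a non-zero integer input
    x <- abs(as.integer(x))
    small <- seq_len(floor(sqrt(x)))
    small <- small[x %% small == 0L]        # divisors up to sqrt(x)
    sort(unique(c(small, x %/% small)))     # add the complementary divisors
}

factors_sqrt(100)
[1]   1   2   4   5  10  20  25  50 100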

Answer 1 (score: 13)

You can get all the factors from the prime factors. gmp calculates those very quickly.

library(gmp)
library(plyr)

get_all_factors <- function(n)
{
  prime_factor_tables <- lapply(
    setNames(n, n), 
    function(i)
    {
      if(i == 1) return(data.frame(x = 1L, freq = 1L))
      plyr::count(as.integer(gmp::factorize(i)))
    }
  )
  lapply(
    prime_factor_tables, 
    function(pft)
    {
      powers <- plyr::alply(pft, 1, function(row) row$x ^ seq.int(0L, row$freq))
      power_grid <- do.call(expand.grid, powers)
      sort(unique(apply(power_grid, 1, prod)))
    }
  )
}

get_all_factors(c(1, 7, 60, 663, 2520, 75600, 15876000, 174636000, 403409160000))
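To see why this works (a worked illustration of mine, not part of the original answer): the prime factorization of 60 is 2^2 * 3 * 5, and every divisor is a product of one power of each prime, which is exactly what the expand.grid step enumerates:

powers <- list(p2 = 2^(0:2), p3 = 3^(0:1), p5 = 5^(0:1))   # {1,2,4} x {1,3} x {1,5}
power_grid <- do.call(expand.grid, powers)                  # 3 * 2 * 2 = 12 combinations
sort(unique(apply(power_grid, 1, prod)))
[1]  1  2  3  4  5  6 10 12 15 20 30 60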

Answer 2 (score: 7)

Major Update

Below is my latest factorization algorithm in R. It is much faster and pays homage to the rle function.

Algorithm 3 (updated)

library(gmp)
MyFactors <- function(MyN) {
    myRle <- function (x1) {
        n1 <- length(x1)
        y1 <- x1[-1L] != x1[-n1]
        i <- c(which(y1), n1)
        list(lengths = diff(c(0L, i)), values = x1[i], uni = sum(y1)+1L)
    }

    if (MyN==1L) return(MyN)
    else {
        pfacs <- myRle(factorize(MyN))
        unip <- pfacs$values
        pv <- pfacs$lengths
        n <- pfacs$uni
        myf <- unip[1L]^(0L:pv[1L])
        if (n > 1L) {
            for (j in 2L:n) {
                myf <- c(myf, do.call(c,lapply(unip[j]^(1L:pv[j]), function(x) x*myf)))
            }
        }
    }
    myf[order(asNumeric(myf))]  ## 'order' is faster than 'sort.list'
}
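To see the connection to rle (an illustrative aside of mine, not from the original answer): applied to a sorted prime factorization, run-length encoding yields exactly the prime/exponent pairs that the loop above combines; myRle mirrors this behavior on the bigz vector returned by factorize and additionally returns the number of distinct primes as uni.

rle(c(2, 2, 2, 3, 3, 5, 7))    # sorted prime factorization of 2520 = 2^3 * 3^2 * 5 * 7
Run Length Encoding
  lengths: int [1:4] 3 2 1 1
  values : num [1:4] 2 3 5 7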

Below are the new benchmarks (as Dirk Eddelbuettel says here, you "can't argue with empirics"):

Case 1 (large prime factors)

set.seed(100)
myList <- lapply(1:10^3, function(x) sample(10^6, 10^5))
benchmark(SortList=lapply(myList, function(x) sort.list(x)),
            OrderFun=lapply(myList, function(x) order(x)),
            replications=3,
            columns = c("test", "replications", "elapsed", "relative"))
      test replications elapsed relative
2 OrderFun            3   59.41    1.000
1 SortList            3   61.52    1.036

## The times are limited by "gmp::factorize" and since it relies on
## pseudo-random numbers, the times can vary (i.e. one pseudo-random
## number may lead to a factorization faster than others). With this
## in mind, any difference of less than half a second
## (or so) should be viewed as the same.
x <- pow.bigz(2,256)+1
system.time(z1 <- MyFactors(x))
user  system elapsed
14.94    0.00   14.94
system.time(z2 <- all_divisors(x))      ## system.time(factorize(x))
user  system elapsed                    ##  user  system elapsed
14.94    0.00   14.96                   ## 14.94    0.00   14.94 
all(z1==z2)
[1] TRUE

x <- as.bigz("12345678987654321321")
system.time(x1 <- MyFactors(x^2))
user  system elapsed 
20.66    0.02   20.71
system.time(x2 <- all_divisors(x^2))    ## system.time(factorize(x^2))
user  system elapsed                    ##  user  system elapsed
20.69    0.00   20.69                   ## 20.67    0.00   20.67
all(x1==x2)
[1] TRUE

Case 2 (smaller numbers)

set.seed(199)
samp <- sample(10^9, 10^5)
benchmark(JosephDivs=sapply(samp, MyFactors),
            DontasDivs=sapply(samp, all_divisors),
            OldDontas=sapply(samp, Oldall_divisors),
            replications=10,
            columns = c("test", "replications", "elapsed", "relative"),
            order = "relative")
        test replications elapsed relative
1 JosephDivs           10  470.31    1.000
2 DontasDivs           10  567.10    1.206  ## with vapply(..., USE.NAMES = FALSE)
3  OldDontas           10  626.19    1.331  ## with sapply

Case 3 (for complete thoroughness)

set.seed(97)
samp <- sample(10^6, 10^4)
benchmark(JosephDivs=sapply(samp, MyFactors),
            DontasDivs=sapply(samp, all_divisors),
            CottonDivs=sapply(samp, get_all_factors),
            ChaseDivs=sapply(samp, FUN),
            replications=5,
            columns = c("test", "replications", "elapsed", "relative"),
            order = "relative")
        test replications elapsed relative
1 JosephDivs            5   22.68    1.000
2 DontasDivs            5   27.66    1.220
3 CottonDivs            5  126.66    5.585
4  ChaseDivs            5  554.25   24.438


Original Post

Mr. Cotton's algorithm is a very nice R implementation. The brute-force method will only get you so far and fails with large numbers (it is also incredibly slow). I have provided three algorithms that meet different needs. The first one (the original algorithm I posted on January 15th, slightly updated since) is a stand-alone factorization algorithm that offers a combinatorial approach which is efficient, accurate, and easily translated into other languages. The second algorithm is more of a sieve: it is very fast and extremely useful when you need the factorizations of thousands of numbers quickly. The third is a short (posted above) yet powerful stand-alone algorithm that is superior for any number less than 2^70 (I scrapped almost everything from my original code). I drew inspiration from Richie Cotton's use of the plyr::count function (it inspired me to write my own rle-like function, which behaves very much like plyr::count), George Dontas's clean handling of the trivial case (i.e. if (n==1) return(1)), and the solution provided by Zelazny7 to a question I had regarding bigz vectors.

Algorithm 1 (original)

library(gmp)
factor2 <- function(MyN) {
    if (MyN == 1) return(1L)
    else {
        max_p_div <- factorize(MyN)
        prime_vec <- max_p_div <- max_p_div[sort.list(asNumeric(max_p_div))]
        my_factors <- powers <- as.bigz(vector())
        uni_p <- unique(prime_vec); maxp <- max(prime_vec)
        for (i in 1:length(uni_p)) {
            temp_size <- length(which(prime_vec == uni_p[i]))
            powers <- c(powers, pow.bigz(uni_p[i], 1:temp_size))
        }
        my_factors <- c(as.bigz(1L), my_factors, powers)
        temp_facs <- powers; r <- 2L
        temp_facs2 <- max_p_div2 <- as.bigz(vector())
        while (r <= length(uni_p)) {
            for (i in 1:length(temp_facs)) {
                a <- which(prime_vec >  max_p_div[i])
                temp <- mul.bigz(temp_facs[i], powers[a])
                temp_facs2 <- c(temp_facs2, temp)
                max_p_div2 <- c(max_p_div2, prime_vec[a])
            }
            my_sort <- sort.list(asNumeric(max_p_div2))
            temp_facs <- temp_facs2[my_sort]
            max_p_div <- max_p_div2[my_sort]
            my_factors <- c(my_factors, temp_facs)
            temp_facs2 <- max_p_div2 <- as.bigz(vector()); r <- r+1L
        }
    }
    my_factors[sort.list(asNumeric(my_factors))]
}

Algorithm 2 (sieve)

EfficientFactorList <- function(n) {
    MyFactsList <- lapply(1:n, function(x) 1)
    for (j in 2:n) {
        for (r in seq.int(j, n, j)) {MyFactsList[[r]] <- c(MyFactsList[[r]], j)}
    }; MyFactsList}
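For example (a usage illustration of mine, not part of the original answer), the r-th element of the returned list holds all divisors of r:

facs <- EfficientFactorList(20)
facs[[12]]
[1]  1  2  3  4  6 12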

It produces the factorization of every number from 1 to 100,000 in less than 2 seconds. To give you an idea of how efficient this algorithm is, computing 1 through 100,000 with the brute-force method takes nearly 3 minutes.

system.time(t1 <- EfficientFactorList(10^5))
user  system elapsed 
1.04    0.00    1.05 
system.time(t2 <- sapply(1:10^5, MyFactors))
user  system elapsed 
39.21    0.00   39.23 
system.time(t3 <- sapply(1:10^5, all_divisors))
user  system elapsed 
49.03    0.02   49.05

TheTest <- sapply(1:10^5, function(x) all(t2[[x]]==t3[[x]]) && all(asNumeric(t2[[x]])==t1[[x]]) && all(asNumeric(t3[[x]])==t1[[x]]))
all(TheTest)
[1] TRUE



Final Thoughts

Mr. Dontas's original comment about factoring large numbers got me thinking: what about really, really big numbers, say greater than 2^200? You will see that whichever algorithm you choose on this page, they will all take a very long time, because most of them rely on gmp::factorize, which uses the Pollard-Rho algorithm. From this question, that algorithm is only reasonable for numbers less than 2^70. I am currently working on my own factorize algorithm that will implement the Quadratic Sieve, which should take all of these algorithms to the next level.

Answer 3 (score: 7)

The following approach delivers correct results, even for very big numbers (which should be passed as strings). And it's really fast.

# TEST
# x <- as.bigz("12345678987654321")
# all_divisors(x)
# all_divisors(x*x)

# x <- pow.bigz(2,89)-1
# all_divisors(x)

library(gmp)
  options(scipen =30)

  sort_listz <- function(z) {
  #==========================
    z <- z[order(as.numeric(z))] # sort(z)
  } # function  sort_listz  


  mult_listz <- function(x,y) {
   do.call('c', lapply(y, function(i) i*x)) 
  } 


  all_divisors <- function(x) {
  #==========================  
  if (abs(x)<=1) return(x) 
  else {

    factorsz <- as.bigz(factorize(as.bigz(x))) # factorize returns up to
    # e.g. x= 12345678987654321  factors: 3 3 3 3 37 37 333667 333667

    factorsz <- sort_listz(factorsz) # vector of primes, sorted

    prime_factorsz <- unique(factorsz)
    #prime_ekt <- sapply(prime_factorsz, function(i) length( factorsz [factorsz==i]))
    prime_ekt <- vapply(prime_factorsz, function(i) sum(factorsz==i), integer(1), USE.NAMES=FALSE)
    spz <- vector() # keep all divisors 
    all <-1
    n <- length(prime_factorsz)
    for (i in 1:n) {
      pr <- prime_factorsz[i]
      pe <- prime_ekt[i]
      all <- all*(pe+1) #counts all divisors 

      prz <- as.bigz(pr)
      pse <- vector(mode="raw",length=pe+1) 
      pse <- c( as.bigz(1), prz)

      if (pe>1) {
        for (k in 2:pe) {
          prz <- prz*pr
          pse[k+1] <- prz
        } # for k
      } # if pe>1

      if (i>1) {
       spz <- mult_listz (spz, pse)         
      } else {
       spz <- pse;
      } # if i>1
    } #for n
    spz <- sort_listz (spz)

    return (spz)
  }  
  } # function  factors_all_divisors  

  #====================================
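To make the key mult_listz step concrete (an aside of mine, not from the original answer): given the divisors accumulated from the primes handled so far and the powers of the next prime, it simply forms all pairwise products. asNumeric is used here only for compact display.

library(gmp)
spz <- as.bigz(c(1, 3, 9))        # divisors built from the prime 3 (3^0, 3^1, 3^2)
pse <- as.bigz(c(1, 37, 1369))    # powers of the next prime 37 (37^0, 37^1, 37^2)
asNumeric(mult_listz(spz, pse))   # all pairwise products, i.e. the divisors of 3^2 * 37^2
[1]     1     3     9    37   111   333  1369  4107 12321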

Refined version, very fast. The code remains simple, readable, and clean.

TEST

#Test 4 (big prime factor)
x <- pow.bigz(2,256)+1 # = 1238926361552897 * 93461639715357977769163558199606896584051237541638188580280321
 system.time(z2 <- all_divisors(x))
#   user  system elapsed 
 #  19.27    1.27   20.56


 #Test 5 (big prime factor)
x <- as.bigz("12345678987654321321") # = 3 * 19 * 216590859432531953

 system.time(x2 <- all_divisors(x^2))
#user  system elapsed 
 #25.65    0.00   25.67  

Answer 4 (score: 5)

A lot has changed in the R landscape since this question was originally asked. In version 0.6-3 of the numbers package, the function divisors was included, which is very useful for getting all of the factors of a number. It will meet the needs of most users, however if you are looking for raw speed or you are working with larger numbers, you will need an alternative method. I have authored two new packages (partially inspired by this question, I might add) that contain highly optimized functions aimed at problems just like this. The first one is RcppAlgos and the other is bigIntegerAlgos.

RcppAlgos

RcppAlgos offers two functions for obtaining the divisors of numbers less than 2^53 - 1: divisorsRcpp (a vectorized function for quickly obtaining the complete factorization of many numbers) and divisorsSieve (quickly generates the complete factorizations over a range). First, we factor many random numbers using divisorsRcpp:

library(gmp)  ## for all_divisors by @GeorgeDontas
library(RcppAlgos)
library(numbers)
options(scipen = 999)
set.seed(42)
testSamp <- sample(10^10, 10)

## vectorized so you can pass the entire vector as an argument
testRcpp <- divisorsRcpp(testSamp)
testDontas <- lapply(testSamp, all_divisors)

identical(lapply(testDontas, as.numeric), testRcpp)
[1] TRUE

And now, using divisorsSieve, factor many numbers over a range:

system.time(testSieve <- divisorsSieve(10^13, 10^13 + 10^5))
 user  system elapsed 
0.586   0.014   0.602 

system.time(testDontasSieve <- lapply((10^13):(10^13 + 10^5), all_divisors))
  user  system elapsed 
54.327   0.187  54.655 

identical(lapply(testDontasSieve, asNumeric), testSieve)
[1] TRUE

Both divisorsRcpp and divisorsSieve are nice functions that are flexible and efficient, however they are limited to 2^53 - 1.
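The reason for the 2^53 - 1 cap (a brief aside of mine, not from the original answer): above 2^53, double-precision numbers can no longer represent consecutive integers exactly, so divisor arithmetic on plain R doubles stops being reliable.

2^53 == 2^53 + 1    # TRUE: 2^53 + 1 cannot be represented and rounds back to 2^53
[1] TRUE
(2^53 - 1) == 2^53  # FALSE: below 2^53 consecutive integers are still distinct
[1] FALSE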

bigIntegerAlgos

The bigIntegerAlgos package features two functions, divisorsBig and quadraticSieve, which are designed for very large numbers. They link directly to the C library gmp. For divisorsBig, we have:

library(bigIntegerAlgos)
## testSamp is defined above... N.B. divisorsBig is not quite as
## efficient as divisorsRcpp. This is so because divisorsRcpp
## can take advantage of more efficient data types..... it is
## still blazing fast!! See the benchmarks below for reference.
testBig <- divisorsBig(testSamp)

identical(testDontas, testBig)
[1] TRUE

Below are the benchmarks defined in my original post (N.B. MyFactors has been replaced by divisorsRcpp and divisorsBig).

## Case 2
library(rbenchmark)
set.seed(199)
samp <- sample(10^9, 10^5)
benchmark(RcppAlgos=divisorsRcpp(samp),
          bigIntegerAlgos=divisorsBig(samp),
          DontasDivs=lapply(samp, all_divisors),
          replications=10,
          columns = c("test", "replications", "elapsed", "relative"),
          order = "relative")

             test replications elapsed relative
1       RcppAlgos           10   8.021    1.000
2 bigIntegerAlgos           10  15.246    1.901
3      DontasDivs           10 400.284   49.905

## Case 3
set.seed(97)
samp <- sample(10^6, 10^4)
benchmark(RcppAlgos=divisorsRcpp(samp),
          bigIntegerAlgos=divisorsBig(samp),
          numbers=lapply(samp, divisors),      ## From the numbers package
          DontasDivs=lapply(samp, all_divisors),
          CottonDivs=lapply(samp, get_all_factors),
          ChaseDivs=lapply(samp, FUN),
          replications=5,
          columns = c("test", "replications", "elapsed", "relative"),
          order = "relative")

             test replications elapsed relative
1       RcppAlgos            5   0.098    1.000
2 bigIntegerAlgos            5   0.330    3.367
3         numbers            5  11.613  118.500
4      DontasDivs            5  16.093  164.214
5      CottonDivs            5  60.428  616.612
6       ChaseDivs            5 342.608 3496.000

The next benchmarks demonstrate the true power of the underlying algorithm in the divisorsBig function. The number being factored is a power of 10, so the prime-factoring step can almost be ignored completely (e.g. system.time(factorize(pow.bigz(10,30))) registers 0 on my machine). Thus, the difference in timing is due solely to how quickly the prime factors can be combined to produce all of the factors.

library(microbenchmark)
powTen <- pow.bigz(10,30)
microbenchmark(divisorsBig(powTen), all_divisors(powTen), unit = "relative")
Unit: relative
                expr      min      lq     mean   median       uq      max neval
 divisorsBig(powTen)  1.00000  1.0000  1.00000  1.00000  1.00000  1.00000   100
all_divisors(powTen) 32.35507 33.3054 33.28786 33.31253 32.11571 40.39236   100
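For a sense of scale (an aside of mine, not from the original answer): 10^30 = 2^30 * 5^30, so both functions in the benchmark above must assemble 31 * 31 = 961 divisors from just two distinct primes.

length(all_divisors(powTen))   # (30 + 1) * (30 + 1) divisors
[1] 961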

## Negative numbers show an even greater increase in efficiency
negPowTen <- powTen * -1
microbenchmark(divisorsBig(negPowTen), all_divisors(negPowTen), unit = "relative")
Unit: relative
                   expr      min       lq    mean   median       uq      max neval
 divisorsBig(negPowTen)  1.00000  1.00000  1.0000  1.00000  1.00000  1.00000   100
all_divisors(negPowTen) 46.42795 46.22408 43.7964 47.93228 45.33406 26.64657   100


quadraticSieve

I will leave you with two demonstrations of quadraticSieve.

n5 <- as.bigz("94968915845307373740134800567566911")
system.time(print(quadraticSieve(n5)))
Big Integer ('bigz') object of length 2:
[1] 216366620575959221 438925910071081891
 user  system elapsed 
4.154   0.021   4.175   ## original time was 3.813167 mins or 228.8 seconds ~ 50x slower

n9 <- prod(nextprime(urand.bigz(2, 82, 42)))
system.time(print(quadraticSieve(n9)))
Big Integer ('bigz') object of length 2:
[1] 2128750292720207278230259 4721136619794898059404993
    user   system  elapsed 
1010.404    2.715 1013.184   ## original time was 12.9297 hours or 46,547 seconds ~ 46x slower

As you can see, quadraticSieve is much faster than the original QuadSieveMultiPolysAll, but there is still much work to be done. There is ongoing research to improve this function, with the current goal of factoring n9 in under one minute. There are also plans to vectorize quadraticSieve as well as to integrate it with divisorsBig, since currently divisorsBig is limited to the same algorithm used by gmp::factorize.