显然,由于R中有一个称为因子的结构,这使得搜索变得困难。
关于我的评论(感谢 @Ramnath 指出我的笔误),在我的 64 位 8 Gb 机器上,暴力方法似乎表现得相当不错:
FUN <- function(x) {
x <- as.integer(x)
div <- seq_len(abs(x))
factors <- div[x %% div == 0L]
factors <- list(neg = -factors, pos = factors)
return(factors)
}
以下是几个示例:
> FUN(100)
$neg
[1] -1 -2 -4 -5 -10 -20 -25 -50 -100
$pos
[1] 1 2 4 5 10 20 25 50 100
> FUN(-42)
$neg
[1] -1 -2 -3 -6 -7 -14 -21 -42
$pos
[1] 1 2 3 6 7 14 21 42
#and big number
> system.time(FUN(1e8))
user system elapsed
1.95 0.18 2.14
div[x %% div == 0L]
和 x <- as.integer(x)
。速度应该会快2倍。 - Marekdiv <- seq_len(abs(x / 2))
,这应该只需要不到一半的时间。 - ceiling catx %% div == 0L
,你可以直接进行否定,例如:!x%%div
。不过代码可读性可能会降低。 - user5957401gmp
可以快速计算这些因数。library(gmp)
library(plyr)
get_all_factors <- function(n)
{
prime_factor_tables <- lapply(
setNames(n, n),
function(i)
{
if(i == 1) return(data.frame(x = 1L, freq = 1L))
plyr::count(as.integer(gmp::factorize(i)))
}
)
lapply(
prime_factor_tables,
function(pft)
{
powers <- plyr::alply(pft, 1, function(row) row$x ^ seq.int(0L, row$freq))
power_grid <- do.call(expand.grid, powers)
sort(unique(apply(power_grid, 1, prod)))
}
)
}
get_all_factors(c(1, 7, 60, 663, 2520, 75600, 15876000, 174636000, 403409160000))
system.time(get_all_factors(1e8))
的执行时间为0秒。 - Richie Cottonget_all_factors
的输入可以使用整数或数值向量。 - Richie Cotton现在已经在包RcppBigIntAlgos
中实现。有关更多详细信息,请参见此答案。
该算法已经完全更新,现在实现了多项式以及一些聪明的筛选技术,可以消除数百万次检查。除了原始链接外,这篇论文以及来自primo的这篇文章在最后阶段非常有帮助(对primo表示赞叹)。Primo在相对较短的空间内很好地解释了QS的要点,并编写了一个非常惊人的算法(它将在不到2秒钟的时间内因子化底部的数字38!+1!!疯狂!!)。
如承诺所示,以下是我谦卑的R实现Quadratic Sieve。自我承诺以来,我一直零散地研究这个算法,自今年1月底以来。我不会尝试充分解释它(除非要求...此外,下面的链接做得非常好),因为它非常复杂,希望我的函数名称代表它们自己。从程序员和数学角度来看,这被证明是我曾经尝试执行的最具挑战性的算法之一。我阅读了无数论文,最终发现这五篇论文最有帮助(QSieve1,QSieve2,QSieve3,QSieve4,QSieve5)。
注意:目前这个算法作为一般的质因数分解算法并不太适用。如果进行进一步优化,需要有一段代码来找出更小的质数(即小于10^5,如此帖所建议),然后调用QuadSieveAll,检查它们是否是质数,如果不是,再对这两个因子分别调用QuadSieveAll等等,直到只剩下所有质数(所有这些步骤都不是很难)。然而,本文的主要重点是突出了二次筛法的核心,因此下面的示例都是半素数(尽管它将分解大多数不包含平方的奇数……另外,我还没有见过一个QS的示例展示了非半素数)。 我知道OP正在寻找返回所有因子而不是质因数分解的方法,但是如果这个算法(如果进一步优化)与上面的算法之一结合起来,作为一般的分解算法会非常强大(特别是考虑到OP需要的是Project Euler,通常需要比暴力方法更多的东西)。顺便说一句,MyIntToBit
函数是这个答案的变体,PrimeSieve
来自Dontas几年前发表的一篇帖子(也值得称赞)。QuadSieveMultiPolysAll <- function(MyN, fudge1=0L, fudge2=0L, LenB=0L) {
### 'MyN' is the number to be factored; 'fudge1' is an arbitrary number
### that is used to determine the size of your prime base for sieving;
### 'fudge2' is used to set a threshold for sieving;
### 'LenB' is a the size of the sieving interval. The last three
### arguments are optional (they are determined based off of the
### size of MyN if left blank)
### The first 8 functions are helper functions
PrimeSieve <- function(n) {
n <- as.integer(n)
if (n > 1e9) stop("n too large")
primes <- rep(TRUE, n)
primes[1] <- FALSE
last.prime <- 2L
fsqr <- floor(sqrt(n))
while (last.prime <= fsqr) {
primes[seq.int(last.prime^2, n, last.prime)] <- FALSE
sel <- which(primes[(last.prime + 1):(fsqr + 1)])
if (any(sel)) {
last.prime <- last.prime + min(sel)
} else {
last.prime <- fsqr + 1
}
}
MyPs <- which(primes)
rm(primes)
gc()
MyPs
}
MyIntToBit <- function(x, dig) {
i <- 0L
string <- numeric(dig)
while (x > 0) {
string[dig - i] <- x %% 2L
x <- x %/% 2L
i <- i + 1L
}
string
}
ExpBySquaringBig <- function(x, n, p) {
if (n == 1) {
MyAns <- mod.bigz(x,p)
} else if (mod.bigz(n,2)==0) {
MyAns <- ExpBySquaringBig(mod.bigz(pow.bigz(x,2),p),div.bigz(n,2),p)
} else {
MyAns <- mod.bigz(mul.bigz(x,ExpBySquaringBig(mod.bigz(
pow.bigz(x,2),p), div.bigz(sub.bigz(n,1),2),p)),p)
}
MyAns
}
TonelliShanks <- function(a,p) {
P1 <- sub.bigz(p,1); j <- 0L; s <- P1
while (mod.bigz(s,2)==0L) {s <- s/2; j <- j+1L}
if (j==1L) {
MyAns1 <- ExpBySquaringBig(a,(p+1L)/4,p)
MyAns2 <- mod.bigz(-1 * ExpBySquaringBig(a,(p+1L)/4,p),p)
} else {
n <- 2L
Legendre2 <- ExpBySquaringBig(n,P1/2,p)
while (Legendre2==1L) {n <- n+1L; Legendre2 <- ExpBySquaringBig(n,P1/2,p)}
x <- ExpBySquaringBig(a,(s+1L)/2,p)
b <- ExpBySquaringBig(a,s,p)
g <- ExpBySquaringBig(n,s,p)
r <- j; m <- 1L
Test <- mod.bigz(b,p)
while (!(Test==1L) && !(m==0L)) {
m <- 0L
Test <- mod.bigz(b,p)
while (!(Test==1L)) {m <- m+1L; Test <- ExpBySquaringBig(b,pow.bigz(2,m),p)}
if (!m==0) {
x <- mod.bigz(x * ExpBySquaringBig(g,pow.bigz(2,r-m-1L),p),p)
g <- ExpBySquaringBig(g,pow.bigz(2,r-m),p)
b <- mod.bigz(b*g,p); r <- m
}; Test <- 0L
}; MyAns1 <- x; MyAns2 <- mod.bigz(p-x,p)
}
c(MyAns1, MyAns2)
}
SieveLists <- function(facLim, FBase, vecLen, sieveD, MInt) {
vLen <- ceiling(vecLen/2); SecondHalf <- (vLen+1L):vecLen
MInt1 <- MInt[1:vLen]; MInt2 <- MInt[SecondHalf]
tl <- vector("list",length=facLim)
for (m in 3:facLim) {
st1 <- mod.bigz(MInt1[1],FBase[m])
m1 <- 1L+as.integer(mod.bigz(sieveD[[m]][1] - st1,FBase[m]))
m2 <- 1L+as.integer(mod.bigz(sieveD[[m]][2] - st1,FBase[m]))
sl1 <- seq.int(m1,vLen,FBase[m])
sl2 <- seq.int(m2,vLen,FBase[m])
tl1 <- list(sl1,sl2)
st2 <- mod.bigz(MInt2[1],FBase[m])
m3 <- vLen+1L+as.integer(mod.bigz(sieveD[[m]][1] - st2,FBase[m]))
m4 <- vLen+1L+as.integer(mod.bigz(sieveD[[m]][2] - st2,FBase[m]))
sl3 <- seq.int(m3,vecLen,FBase[m])
sl4 <- seq.int(m4,vecLen,FBase[m])
tl2 <- list(sl3,sl4)
tl[[m]] <- list(tl1,tl2)
}
tl
}
SieverMod <- function(facLim, FBase, vecLen, SD, MInt, FList, LogFB, Lim, myCol) {
MyLogs <- rep(0,nrow(SD))
for (m in 3:facLim) {
MyBool <- rep(FALSE,vecLen)
MyBool[c(FList[[m]][[1]][[1]],FList[[m]][[2]][[1]])] <- TRUE
MyBool[c(FList[[m]][[1]][[2]],FList[[m]][[2]][[2]])] <- TRUE
temp <- which(MyBool)
MyLogs[temp] <- MyLogs[temp] + LogFB[m]
}
MySieve <- which(MyLogs > Lim)
MInt <- MInt[MySieve]; NewSD <- SD[MySieve,]
newLen <- length(MySieve); GoForIT <- FALSE
MyMat <- matrix(integer(0),nrow=newLen,ncol=myCol)
MyMat[which(NewSD[,1L] < 0),1L] <- 1L; MyMat[which(NewSD[,1L] > 0),1L] <- 0L
if ((myCol-1L) - (facLim+1L) > 0L) {MyMat[,((facLim+2L):(myCol-1L))] <- 0L}
if (newLen==1L) {MyMat <- matrix(MyMat,nrow=1,byrow=TRUE)}
if (newLen > 0L) {
GoForIT <- TRUE
for (m in 1:facLim) {
vec <- rep(0L,newLen)
temp <- which((NewSD[,1L]%%FBase[m])==0L)
NewSD[temp,] <- NewSD[temp,]/FBase[m]; vec[temp] <- 1L
test <- temp[which((NewSD[temp,]%%FBase[m])==0L)]
while (length(test)>0L) {
NewSD[test,] <- NewSD[test,]/FBase[m]
vec[test] <- (vec[test]+1L)
test <- test[which((NewSD[test,]%%FBase[m])==0L)]
}
MyMat[,m+1L] <- vec
}
}
list(MyMat,NewSD,MInt,GoForIT)
}
reduceMatrix <- function(mat) {
tempMin <- 0L; n1 <- ncol(mat); n2 <- nrow(mat)
mymax <- 1L
for (i in 1:n1) {
temp <- which(mat[,i]==1L)
t <- which(temp >= mymax)
if (length(temp)>0L && length(t)>0L) {
MyMin <- min(temp[t])
if (!(MyMin==mymax)) {
vec <- mat[MyMin,]
mat[MyMin,] <- mat[mymax,]
mat[mymax,] <- vec
}
t <- t[-1]; temp <- temp[t]
for (j in temp) {mat[j,] <- (mat[j,]+mat[mymax,])%%2L}
mymax <- mymax+1L
}
}
if (mymax<n2) {simpMat <- mat[-(mymax:n2),]} else {simpMat <- mat}
lenSimp <- nrow(simpMat)
if (is.null(lenSimp)) {lenSimp <- 0L}
mycols <- 1:n1
if (lenSimp>1L) {
## "Diagonalizing" Matrix
for (i in 1:lenSimp) {
if (all(simpMat[i,]==0L)) {simpMat <- simpMat[-i,]; next}
if (!simpMat[i,i]==1L) {
t <- min(which(simpMat[i,]==1L))
vec <- simpMat[,i]; tempCol <- mycols[i]
simpMat[,i] <- simpMat[,t]; mycols[i] <- mycols[t]
simpMat[,t] <- vec; mycols[t] <- tempCol
}
}
lenSimp <- nrow(simpMat); MyList <- vector("list",length=n1)
MyFree <- mycols[which((1:n1)>lenSimp)]; for (i in MyFree) {MyList[[i]] <- i}
if (is.null(lenSimp)) {lenSimp <- 0L}
if (lenSimp>1L) {
for (i in lenSimp:1L) {
t <- which(simpMat[i,]==1L)
if (length(t)==1L) {
simpMat[ ,t] <- 0L
MyList[[mycols[i]]] <- 0L
} else {
t1 <- t[t>i]
if (all(t1 > lenSimp)) {
MyList[[mycols[i]]] <- MyList[[mycols[t1[1]]]]
if (length(t1)>1) {
for (j in 2:length(t1)) {MyList[[mycols[i]]] <- c(MyList[[mycols[i]]], MyList[[mycols[t1[j]]]])}
}
}
else {
for (j in t1) {
if (length(MyList[[mycols[i]]])==0L) {MyList[[mycols[i]]] <- MyList[[mycols[j]]]}
else {
e1 <- which(MyList[[mycols[i]]]%in%MyList[[mycols[j]]])
if (length(e1)==0) {
MyList[[mycols[i]]] <- c(MyList[[mycols[i]]],MyList[[mycols[j]]])
} else {
e2 <- which(!MyList[[mycols[j]]]%in%MyList[[mycols[i]]])
MyList[[mycols[i]]] <- MyList[[mycols[i]]][-e1]
if (length(e2)>0L) {MyList[[mycols[i]]] <- c(MyList[[mycols[i]]], MyList[[mycols[j]]][e2])}
}
}
}
}
}
}
TheList <- lapply(MyList, function(x) {if (length(x)==0L) {0} else {x}})
list(TheList,MyFree)
} else {
list(NULL,NULL)
}
} else {
list(NULL,NULL)
}
}
GetFacs <- function(vec1, vec2, n) {
x <- mod.bigz(prod.bigz(vec1),n)
y <- mod.bigz(prod.bigz(vec2),n)
MyAns <- c(gcd.bigz(x-y,n),gcd.bigz(x+y,n))
MyAns[sort.list(asNumeric(MyAns))]
}
SolutionSearch <- function(mymat, M2, n, FB) {
colTest <- which(apply(mymat, 2, sum) == 0)
if (length(colTest) > 0) {solmat <- mymat[ ,-colTest]} else {solmat <- mymat}
if (length(nrow(solmat)) > 0) {
nullMat <- reduceMatrix(t(solmat %% 2L))
listSol <- nullMat[[1]]; freeVar <- nullMat[[2]]; LF <- length(freeVar)
} else {LF <- 0L}
if (LF > 0L) {
for (i in 2:min(10^8,(2^LF + 1L))) {
PosAns <- MyIntToBit(i, LF)
posVec <- sapply(listSol, function(x) {
t <- which(freeVar %in% x)
if (length(t)==0L) {
0
} else {
sum(PosAns[t])%%2L
}
})
ansVec <- which(posVec==1L)
if (length(ansVec)>0) {
if (length(ansVec) > 1L) {
myY <- apply(mymat[ansVec,],2,sum)
} else {
myY <- mymat[ansVec,]
}
if (sum(myY %% 2) < 1) {
myY <- as.integer(myY/2)
myY <- pow.bigz(FB,myY[-1])
temp <- GetFacs(M2[ansVec], myY, n)
if (!(1==temp[1]) && !(1==temp[2])) {
return(temp)
}
}
}
}
}
}
### Below is the main portion of the Quadratic Sieve
BegTime <- Sys.time(); MyNum <- as.bigz(MyN); DigCount <- nchar(as.character(MyN))
P <- PrimeSieve(10^5)
SqrtInt <- .mpfr2bigz(trunc(sqrt(mpfr(MyNum,sizeinbase(MyNum,b=2)+5L))))
if (DigCount < 24) {
DigSize <- c(4,10,15,20,23)
f_Pos <- c(0.5,0.25,0.15,0.1,0.05)
MSize <- c(5000,7000,10000,12500,15000)
if (fudge1==0L) {
LM1 <- lm(f_Pos ~ DigSize)
m1 <- summary(LM1)$coefficients[2,1]
b1 <- summary(LM1)$coefficients[1,1]
fudge1 <- DigCount*m1 + b1
}
if (LenB==0L) {
LM2 <- lm(MSize ~ DigSize)
m2 <- summary(LM2)$coefficients[2,1]
b2 <- summary(LM2)$coefficients[1,1]
LenB <- ceiling(DigCount*m2 + b2)
}
LimB <- trunc(exp((.5+fudge1)*sqrt(log(MyNum)*log(log(MyNum)))))
B <- P[P<=LimB]; B <- B[-1]
facBase <- P[which(sapply(B, function(x) ExpBySquaringBig(MyNum,(x-1)/2,x)==1L))+1L]
LenFBase <- length(facBase)+1L
} else if (DigCount < 67) {
## These values were obtained from "The Multiple Polynomial
## Quadratic Sieve" by Robert D. Silverman
DigSize <- c(24,30,36,42,48,54,60,66)
FBSize <- c(100,200,400,900,1200,2000,3000,4500)
MSize <- c(5,25,25,50,100,250,350,500)
LM1 <- loess(FBSize ~ DigSize)
LM2 <- loess(MSize ~ DigSize)
if (fudge1==0L) {
fudge1 <- -0.4
LimB <- trunc(exp((.5+fudge1)*sqrt(log(MyNum)*log(log(MyNum)))))
myTarget <- ceiling(predict(LM1, DigCount))
while (LimB < myTarget) {
LimB <- trunc(exp((.5+fudge1)*sqrt(log(MyNum)*log(log(MyNum)))))
fudge1 <- fudge1+0.001
}
B <- P[P<=LimB]; B <- B[-1]
facBase <- P[which(sapply(B, function(x) ExpBySquaringBig(MyNum,(x-1)/2,x)==1L))+1L]
LenFBase <- length(facBase)+1L
while (LenFBase < myTarget) {
fudge1 <- fudge1+0.005
LimB <- trunc(exp((.5+fudge1)*sqrt(log(MyNum)*log(log(MyNum)))))
myind <- which(P==max(B))+1L
myset <- tempP <- P[myind]
while (tempP < LimB) {
myind <- myind + 1L
tempP <- P[myind]
myset <- c(myset, tempP)
}
for (p in myset) {
t <- ExpBySquaringBig(MyNum,(p-1)/2,p)==1L
if (t) {facBase <- c(facBase,p)}
}
B <- c(B, myset)
LenFBase <- length(facBase)+1L
}
} else {
LimB <- trunc(exp((.5+fudge1)*sqrt(log(MyNum)*log(log(MyNum)))))
B <- P[P<=LimB]; B <- B[-1]
facBase <- P[which(sapply(B, function(x) ExpBySquaringBig(MyNum,(x-1)/2,x)==1L))+1L]
LenFBase <- length(facBase)+1L
}
if (LenB==0L) {LenB <- 1000*ceiling(predict(LM2, DigCount))}
} else {
return("The number you've entered is currently too big for this algorithm!!")
}
SieveDist <- lapply(facBase, function(x) TonelliShanks(MyNum,x))
SieveDist <- c(1L,SieveDist); SieveDist[[1]] <- c(SieveDist[[1]],1L); facBase <- c(2L,facBase)
Lower <- -LenB; Upper <- LenB; LenB2 <- 2*LenB+1L; MyInterval <- Lower:Upper
M <- MyInterval + SqrtInt ## Set that will be tested
SqrDiff <- matrix(sub.bigz(pow.bigz(M,2),MyNum),nrow=length(M),ncol=1L)
maxM <- max(MyInterval)
LnFB <- log(facBase)
## N.B. primo uses 0.735, as his siever
## is more efficient than the one employed here
if (fudge2==0L) {
if (DigCount < 8) {
fudge2 <- 0
} else if (DigCount < 12) {
fudge2 <- .7
} else if (DigCount < 20) {
fudge2 <- 1.3
} else {
fudge2 <- 1.6
}
}
TheCut <- log10(maxM*sqrt(2*asNumeric(MyNum)))*fudge2
myPrimes <- as.bigz(facBase)
CoolList <- SieveLists(LenFBase, facBase, LenB2, SieveDist, MyInterval)
GetMatrix <- SieverMod(LenFBase, facBase, LenB2, SqrDiff, M, CoolList, LnFB, TheCut, LenFBase+1L)
if (GetMatrix[[4]]) {
newmat <- GetMatrix[[1]]; NewSD <- GetMatrix[[2]]; M <- GetMatrix[[3]]
NonSplitFacs <- which(abs(NewSD[,1L])>1L)
newmat <- newmat[-NonSplitFacs, ]
M <- M[-NonSplitFacs]
lenM <- length(M)
if (class(newmat) == "matrix") {
if (nrow(newmat) > 0) {
PosAns <- SolutionSearch(newmat,M,MyNum,myPrimes)
} else {
PosAns <- vector()
}
} else {
newmat <- matrix(newmat, nrow = 1)
PosAns <- vector()
}
} else {
newmat <- matrix(integer(0),ncol=(LenFBase+1L))
PosAns <- vector()
}
Atemp <- .mpfr2bigz(trunc(sqrt(sqrt(mpfr(2*MyNum))/maxM)))
if (Atemp < max(facBase)) {Atemp <- max(facBase)}; myPoly <- 0L
while (length(PosAns)==0L) {LegTest <- TRUE
while (LegTest) {
Atemp <- nextprime(Atemp)
Legendre <- asNumeric(ExpBySquaringBig(MyNum,(Atemp-1L)/2,Atemp))
if (Legendre == 1) {LegTest <- FALSE}
}
A <- Atemp^2
Btemp <- max(TonelliShanks(MyNum, Atemp))
B2 <- (Btemp + (MyNum - Btemp^2) * inv.bigz(2*Btemp,Atemp))%%A
C <- as.bigz((B2^2 - MyNum)/A)
myPoly <- myPoly + 1L
polySieveD <- lapply(1:LenFBase, function(x) {
AInv <- inv.bigz(A,facBase[x])
asNumeric(c(((SieveDist[[x]][1]-B2)*AInv)%%facBase[x],
((SieveDist[[x]][2]-B2)*AInv)%%facBase[x]))
})
M1 <- A*MyInterval + B2
SqrDiff <- matrix(A*pow.bigz(MyInterval,2) + 2*B2*MyInterval + C,nrow=length(M1),ncol=1L)
CoolList <- SieveLists(LenFBase, facBase, LenB2, polySieveD, MyInterval)
myPrimes <- c(myPrimes,Atemp)
LenP <- length(myPrimes)
GetMatrix <- SieverMod(LenFBase, facBase, LenB2, SqrDiff, M1, CoolList, LnFB, TheCut, LenP+1L)
if (GetMatrix[[4]]) {
n2mat <- GetMatrix[[1]]; N2SD <- GetMatrix[[2]]; M1 <- GetMatrix[[3]]
n2mat[,LenP+1L] <- rep(2L,nrow(N2SD))
if (length(N2SD) > 0) {NonSplitFacs <- which(abs(N2SD[,1L])>1L)} else {NonSplitFacs <- LenB2}
if (length(NonSplitFacs)<2*LenB) {
M1 <- M1[-NonSplitFacs]; lenM1 <- length(M1)
n2mat <- n2mat[-NonSplitFacs,]
if (lenM1==1L) {n2mat <- matrix(n2mat,nrow=1)}
if (ncol(newmat) < (LenP+1L)) {
numCol <- (LenP + 1L) - ncol(newmat)
newmat <- cbind(newmat,matrix(rep(0L,numCol*nrow(newmat)),ncol=numCol))
}
newmat <- rbind(newmat,n2mat); lenM <- lenM+lenM1; M <- c(M,M1)
if (class(newmat) == "matrix") {
if (nrow(newmat) > 0) {
PosAns <- SolutionSearch(newmat,M,MyNum,myPrimes)
}
}
}
}
}
EndTime <- Sys.time()
TotTime <- EndTime - BegTime
print(format(TotTime))
return(PosAns)
}
使用旧的QS算法
> library(gmp)
> library(Rmpfr)
> n3 <- prod(nextprime(urand.bigz(2, 40, 17)))
> system.time(t5 <- QuadSieveAll(n3,0.1,myps))
user system elapsed
164.72 0.77 165.63
> system.time(t6 <- factorize(n3))
user system elapsed
0.1 0.0 0.1
> all(t5[sort.list(asNumeric(t5))]==t6[sort.list(asNumeric(t6))])
[1] TRUE
采用新的多项式QS算法
> QuadSieveMultiPolysAll(n3)
[1] "4.952 secs"
Big Integer ('bigz') object of length 2:
[1] 342086446909 483830424611
> n4 <- prod(nextprime(urand.bigz(2,50,5)))
> QuadSieveMultiPolysAll(n4) ## With old algo, it took over 4 hours
[1] "1.131717 mins"
Big Integer ('bigz') object of length 2:
[1] 166543958545561 880194119571287
> n5 <- as.bigz("94968915845307373740134800567566911") ## 35 digits
> QuadSieveMultiPolysAll(n5)
[1] "3.813167 mins"
Big Integer ('bigz') object of length 2:
[1] 216366620575959221 438925910071081891
> system.time(factorize(n5)) ## It appears we are reaching the limits of factorize
user system elapsed
131.97 0.00 131.98
顺便提一下:上面的数字n5非常有趣。您可以在这里查看。
瓶颈出现了!!!
> n6 <- factorialZ(38) + 1L ## 45 digits
> QuadSieveMultiPolysAll(n6)
[1] "22.79092 mins"
Big Integer ('bigz') object of length 2:
[1] 14029308060317546154181 37280713718589679646221
> system.time(factorize(n6)) ## Shut it down after 2 days of running
最新成就(50位数字)
> n9 <- prod(nextprime(urand.bigz(2,82,42)))
> QuadSieveMultiPolysAll(n9)
[1] "12.9297 hours"
Big Integer ('bigz') object of length 2:
[1] 2128750292720207278230259 4721136619794898059404993
## Based off of some crude test, factorize(n9) would take more than a year.
需要注意的是,对于较小的数字,量子分解算法通常比波拉德-罗算法表现更差,随着数字的增大,量子分解算法的优势开始显现。
以下是我最新的R分解算法。它速度更快,并向rle函数致敬。
算法3(已更新)
library(gmp)
MyFactors <- function(MyN) {
myRle <- function (x1) {
n1 <- length(x1)
y1 <- x1[-1L] != x1[-n1]
i <- c(which(y1), n1)
list(lengths = diff(c(0L, i)), values = x1[i], uni = sum(y1)+1L)
}
if (MyN==1L) return(MyN)
else {
pfacs <- myRle(factorize(MyN))
unip <- pfacs$values
pv <- pfacs$lengths
n <- pfacs$uni
myf <- unip[1L]^(0L:pv[1L])
if (n > 1L) {
for (j in 2L:n) {
myf <- c(myf, do.call(c,lapply(unip[j]^(1L:pv[j]), function(x) x*myf)))
}
}
}
myf[order(asNumeric(myf))] ## 'order' is faster than 'sort.list'
}
以下是新的基准测试(正如Dirk Eddelbuettel所说这里,“无法反驳经验数据”):
情况1(大质因数)
set.seed(100)
myList <- lapply(1:10^3, function(x) sample(10^6, 10^5))
benchmark(SortList=lapply(myList, function(x) sort.list(x)),
OrderFun=lapply(myList, function(x) order(x)),
replications=3,
columns = c("test", "replications", "elapsed", "relative"))
test replications elapsed relative
2 OrderFun 3 59.41 1.000
1 SortList 3 61.52 1.036
## The times are limited by "gmp::factorize" and since it relies on
## pseudo-random numbers, the times can vary (i.e. one pseudo random
## number may lead to a factorization faster than others). With this
## in mind, any differences less than a half of second
## (or so) should be viewed as the same.
x <- pow.bigz(2,256)+1
system.time(z1 <- MyFactors(x))
user system elapsed
14.94 0.00 14.94
system.time(z2 <- all_divisors(x)) ## system.time(factorize(x))
user system elapsed ## user system elapsed
14.94 0.00 14.96 ## 14.94 0.00 14.94
all(z1==z2)
[1] TRUE
x <- as.bigz("12345678987654321321")
system.time(x1 <- MyFactors(x^2))
user system elapsed
20.66 0.02 20.71
system.time(x2 <- all_divisors(x^2)) ## system.time(factorize(x^2))
user system elapsed ## user system elapsed
20.69 0.00 20.69 ## 20.67 0.00 20.67
all(x1==x2)
[1] TRUE
案例2(较小的数字)
set.seed(199)
samp <- sample(10^9, 10^5)
benchmark(JosephDivs=sapply(samp, MyFactors),
DontasDivs=sapply(samp, all_divisors),
OldDontas=sapply(samp, Oldall_divisors),
replications=10,
columns = c("test", "replications", "elapsed", "relative"),
order = "relative")
test replications elapsed relative
1 JosephDivs 10 470.31 1.000
2 DontasDivs 10 567.10 1.206 ## with vapply(..., USE.NAMES = FALSE)
3 OldDontas 10 626.19 1.331 ## with sapply
案例3(为了完整彻底)
set.seed(97)
samp <- sample(10^6, 10^4)
benchmark(JosephDivs=sapply(samp, MyFactors),
DontasDivs=sapply(samp, all_divisors),
CottonDivs=sapply(samp, get_all_factors),
ChaseDivs=sapply(samp, FUN),
replications=5,
columns = c("test", "replications", "elapsed", "relative"),
order = "relative")
test replications elapsed relative
1 JosephDivs 5 22.68 1.000
2 DontasDivs 5 27.66 1.220
3 CottonDivs 5 126.66 5.585
4 ChaseDivs 5 554.25 24.438
@RichieCotton的算法是一个非常好的R实现。暴力方法只能让你走得更远,而且在处理大量数字时会失败。我提供了三种算法,以满足不同的需求。第一种(是我在1月15日发布的原始算法,稍作更新),是一种独立的分解算法,提供了一种组合方法,高效、准确,并且可以轻松地转换为其他语言。第二个算法更像是一个筛子,非常快速,当您需要快速地分解数千个数字时非常有用。第三个是一个简短但功能强大的独立算法(如上所述),对于小于2^70的任何数字都优于其他算法(我从原来的代码中删掉了几乎所有内容)。我从Richie Cotton使用plyr::count
函数的方式中汲取灵感(它启发我编写了自己的rle
函数,该函数具有与plyr::count
非常相似的返回值),George Dontas处理微不足道的情况(即if(n==1) return(1)
)的简洁方式,以及@Zelazny7提供的解决方案,回答了我关于bigz向量的问题。
算法1(原始版)
library(gmp)
factor2 <- function(MyN) {
if (MyN == 1) return(1L)
else {
max_p_div <- factorize(MyN)
prime_vec <- max_p_div <- max_p_div[sort.list(asNumeric(max_p_div))]
my_factors <- powers <- as.bigz(vector())
uni_p <- unique(prime_vec); maxp <- max(prime_vec)
for (i in 1:length(uni_p)) {
temp_size <- length(which(prime_vec == uni_p[i]))
powers <- c(powers, pow.bigz(uni_p[i], 1:temp_size))
}
my_factors <- c(as.bigz(1L), my_factors, powers)
temp_facs <- powers; r <- 2L
temp_facs2 <- max_p_div2 <- as.bigz(vector())
while (r <= length(uni_p)) {
for (i in 1:length(temp_facs)) {
a <- which(prime_vec > max_p_div[i])
temp <- mul.bigz(temp_facs[i], powers[a])
temp_facs2 <- c(temp_facs2, temp)
max_p_div2 <- c(max_p_div2, prime_vec[a])
}
my_sort <- sort.list(asNumeric(max_p_div2))
temp_facs <- temp_facs2[my_sort]
max_p_div <- max_p_div2[my_sort]
my_factors <- c(my_factors, temp_facs)
temp_facs2 <- max_p_div2 <- as.bigz(vector()); r <- r+1L
}
}
my_factors[sort.list(asNumeric(my_factors))]
}
算法2(筛法)
EfficientFactorList <- function(n) {
MyFactsList <- lapply(1:n, function(x) 1)
for (j in 2:n) {
for (r in seq.int(j, n, j)) {MyFactsList[[r]] <- c(MyFactsList[[r]], j)}
}; MyFactsList}
system.time(t1 <- EfficientFactorList(10^5))
user system elapsed
1.04 0.00 1.05
system.time(t2 <- sapply(1:10^5, MyFactors))
user system elapsed
39.21 0.00 39.23
system.time(t3 <- sapply(1:10^5, all_divisors))
user system elapsed
49.03 0.02 49.05
TheTest <- sapply(1:10^5, function(x) all(t2[[x]]==t3[[x]]) && all(asNumeric(t2[[x]])==t1[[x]]) && all(asNumeric(t3[[x]])==t1[[x]]))
all(TheTest)
[1] TRUE
最后的想法
@Dontas关于分解大数的原始评论让我思考,那么对于真正的大数...比2^200还要大的数怎么办。您将看到,在此页面上选择的任何算法都将需要很长时间,因为它们大多依赖于使用Pollard-Rho算法的gmp::factorize
。从这个问题中可以看出,该算法仅适用于小于2^70的数字。我目前正在开发自己的factorize算法,它将实现Quadratic Sieve,这应该会把所有这些算法提升到一个新的水平。
0.6-3
的numbers
包中,包含了一个非常实用的函数divisors
,可以获取一个数的所有因子。它可以满足大多数用户的需求,但如果你想要更快的速度或者你需要处理更大的数字,则需要另一种方法。我编写了两个新的包(部分受到这个问题的启发),其中包含了旨在解决此类问题的高度优化的函数。第一个是RcppAlgos
,另一个是RcppBigIntAlgos
(之前称为bigIntegerAlgos
)。
RcppAlgos
RcppAlgos
包含两个函数,用于获取小于2^53 - 1
的数的因子:divisorsRcpp
(一个矢量化函数,可快速获取许多数字的完整分解)和divisorsSieve
(快速生成范围内的完整分解)。首先,我们使用divisorsRcpp
对许多随机数进行因式分解:library(gmp) ## for all_divisors by @GeorgeDontas
library(RcppAlgos)
library(numbers)
options(scipen = 999)
set.seed(42)
testSamp <- sample(10^10, 10)
## vectorized so you can pass the entire vector as an argument
testRcpp <- divisorsRcpp(testSamp)
testDontas <- lapply(testSamp, all_divisors)
identical(lapply(testDontas, as.numeric), testRcpp)
[1] TRUE
现在,使用 divisorsSieve
在一个范围内因式分解多个数字:
system.time(testSieve <- divisorsSieve(10^13, 10^13 + 10^5))
user system elapsed
0.242 0.006 0.247
system.time(testDontasSieve <- lapply((10^13):(10^13 + 10^5), all_divisors))
user system elapsed
47.880 0.132 47.922
identical(lapply(testDontasSieve, asNumeric), testSieve)
[1] TRUE
divisorsRcpp
和divisorsSieve
都是很好的函数,具有灵活性和高效性,但是它们的限制是2^53 - 1
。
RcppBigIntAlgos
RcppBigIntAlgos
包(版本0.2.0之前称为bigIntegerAlgos
)直接链接到C库gmp,并具有专为非常大的数字设计的divisorsBig
功能。
library(RcppBigIntAlgos)
## testSamp is defined above... N.B. divisorsBig is not quite as
## efficient as divisorsRcpp. This is so because divisorsRcpp
## can take advantage of more efficient data types.
testBig <- divisorsBig(testSamp)
identical(testDontas, testBig)
[1] TRUE
MyFactors
已被替换为 divisorsRcpp
和 divisorsBig
)。## Case 2
library(rbenchmark)
set.seed(199)
samp <- sample(10^9, 10^5)
benchmark(RcppAlgos=divisorsRcpp(samp),
RcppBigIntAlgos=divisorsBig(samp),
DontasDivs=lapply(samp, all_divisors),
replications=10,
columns = c("test", "replications", "elapsed", "relative"),
order = "relative")
test replications elapsed relative
1 RcppAlgos 10 5.236 1.000
2 RcppBigIntAlgos 10 12.846 2.453
3 DontasDivs 10 383.742 73.289
## Case 3
set.seed(97)
samp <- sample(10^6, 10^4)
benchmark(RcppAlgos=divisorsRcpp(samp),
RcppBigIntAlgos=divisorsBig(samp),
numbers=lapply(samp, divisors), ## From the numbers package
DontasDivs=lapply(samp, all_divisors),
CottonDivs=lapply(samp, get_all_factors),
ChaseDivs=lapply(samp, FUN),
replications=5,
columns = c("test", "replications", "elapsed", "relative"),
order = "relative")
test replications elapsed relative
1 RcppAlgos 5 0.083 1.000
2 RcppBigIntAlgos 5 0.265 3.193
3 numbers 5 12.913 155.578
4 DontasDivs 5 15.813 190.518
5 CottonDivs 5 60.745 731.867
6 ChaseDivs 5 299.520 3608.675
divisorsBig
函数底层算法的真正实力。被分解的数字是10的幂,因此质因数分解步骤几乎可以完全忽略(例如在我的机器上,system.time(factorize(pow.bigz(10,30)))
记录0
)。因此,定时差异仅取决于如何快速地组合素因子以产生所有因子。library(microbenchmark)
powTen <- pow.bigz(10, 30)
microbenchmark(divisorsBig(powTen), all_divisors(powTen), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
divisorsBig(powTen) 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 100
all_divisors(powTen) 21.49849 21.27973 21.13085 20.63345 21.18834 20.38772 100
## Negative numbers show an even greater increase in efficiency
negPowTen <- powTen * -1
microbenchmark(divisorsBig(negPowTen), all_divisors(negPowTen), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
divisorsBig(negPowTen) 1.00000 1.0000 1.0000 1.00000 1.00000 1.00000 100
all_divisors(negPowTen) 28.75275 28.1864 27.9335 27.57434 27.91376 30.16962 100
使用 divisorsBig
,对于非常大的输入,获得完整的质因数分解是没有问题的。该算法根据输入动态调整,并在不同情况下应用不同的算法。如果使用Lenstra的椭圆曲线方法或二次筛法,我们还可以利用多线程。
以下是使用此答案中定义的n5
和n9
的一些示例。
n5 <- as.bigz("94968915845307373740134800567566911")
system.time(print(divisorsBig(n5)))
Big Integer ('bigz') object of length 4:
[1] 1 216366620575959221 438925910071081891
[4] 94968915845307373740134800567566911
user system elapsed
0.162 0.003 0.164
n9 <- prod(nextprime(urand.bigz(2, 82, 42)))
system.time(print(divisorsBig(n9, nThreads=4)))
Big Integer ('bigz') object of length 4:
[1] 1
[2] 2128750292720207278230259
[3] 4721136619794898059404993
[4] 10050120961360479179164300841596861740399588283187
user system elapsed
1.776 0.011 0.757
这里提供了一个由@Dontas提供的示例,其中包含一个大质数和一个较小的质数:
x <- pow.bigz(2, 256) + 1
divisorsBig(x, showStats=TRUE, nThreads=8)
Summary Statistics for Factoring:
115792089237316195423570985008687907853269984665640564039457584007913129639937
| Pollard Rho Time |
|--------------------|
| 479ms |
| Lenstra ECM Time | Number of Curves |
|--------------------|--------------------|
| 1s 870ms | 2584 |
| Total Time |
|--------------------|
| 2s 402ms |
Big Integer ('bigz') object of length 4:
[1] 1
[2] 1238926361552897
[3] 93461639715357977769163558199606896584051237541638188580280321
[4] 115792089237316195423570985008687907853269984665640564039457584007913129639937
与使用 gmp::factorize
查找质因数分解相比,可以进行比较:
system.time(factorize(x))
user system elapsed
9.199 0.036 9.248
## https://members.loria.fr/PZimmermann/records/rsa.html
rsa79 <- as.bigz("7293469445285646172092483905177589838606665884410340391954917800303813280275279")
divisorsBig(rsa79, nThreads=8, showStats=TRUE, skipPolRho=T, skipECM=T)
Summary Statistics for Factoring:
7293469445285646172092483905177589838606665884410340391954917800303813280275279
| MPQS Time | Complete | Polynomials | Smooths | Partials |
|--------------------|----------|-------------|------------|------------|
| 2m 49s 174ms | 100 | 91221 | 5651 | 7096 |
| Mat Algebra Time | Mat Dimension |
|--------------------|--------------------|
| 14s 863ms | 12625 x 12747 |
| Total Time |
|--------------------|
| 3m 4s 754ms |
Big Integer ('bigz') object of length 4:
[1] 1
[2] 848184382919488993608481009313734808977
[3] 8598919753958678882400042972133646037727
[4] 7293469445285646172092483905177589838606665884410340391954917800303813280275279
# TEST
# x <- as.bigz("12345678987654321")
# all_divisors(x)
# all_divisors(x*x)
# x <- pow.bigz(2,89)-1
# all_divisors(x)
library(gmp)
options(scipen =30)
sort_listz <- function(z) {
#==========================
z <- z[order(as.numeric(z))] # sort(z)
} # function sort_listz
mult_listz <- function(x,y) {
do.call('c', lapply(y, function(i) i*x))
}
all_divisors <- function(x) {
#==========================
if (abs(x)<=1) return(x)
else {
factorsz <- as.bigz(factorize(as.bigz(x))) # factorize returns up to
# e.g. x= 12345678987654321 factors: 3 3 3 3 37 37 333667 333667
factorsz <- sort_listz(factorsz) # vector of primes, sorted
prime_factorsz <- unique(factorsz)
#prime_ekt <- sapply(prime_factorsz, function(i) length( factorsz [factorsz==i]))
prime_ekt <- vapply(prime_factorsz, function(i) sum(factorsz==i), integer(1), USE.NAMES=FALSE)
spz <- vector() # keep all divisors
all <-1
n <- length(prime_factorsz)
for (i in 1:n) {
pr <- prime_factorsz[i]
pe <- prime_ekt[i]
all <- all*(pe+1) #counts all divisors
prz <- as.bigz(pr)
pse <- vector(mode="raw",length=pe+1)
pse <- c( as.bigz(1), prz)
if (pe>1) {
for (k in 2:pe) {
prz <- prz*pr
pse[k+1] <- prz
} # for k
} # if pe>1
if (i>1) {
spz <- mult_listz (spz, pse)
} else {
spz <- pse;
} # if i>1
} #for n
spz <- sort_listz (spz)
return (spz)
}
} # function factors_all_divisors
#====================================
经过精细优化的版本,非常快速。代码仍然简单、易读和清晰。
测试
#Test 4 (big prime factor)
x <- pow.bigz(2,256)+1 # = 1238926361552897 * 93461639715357977769163558199606896584051237541638188580280321
system.time(z2 <- all_divisors(x))
# user system elapsed
# 19.27 1.27 20.56
#Test 5 (big prime factor)
x <- as.bigz("12345678987654321321") # = 3 * 19 * 216590859432531953
system.time(x2 <- all_divisors(x^2))
#user system elapsed
#25.65 0.00 25.67
mult_listz <- function(x,y) { z <- x; for (j in 2:length(y)) { # y[1] == 1 z <- c(z,y[j] * x) } # for j return(z) }
,使用 mult_listz <- function(x,y) { z <- do.call('c', lapply(y, function(i) i*x)) return (z) }
。这将提高它的性能!(受Joseph代码的启发)。 - user5821909prime_ekt <- vapply(prime_factorz, function(i) sum(factorz==i), integer(1), USE.NAMES=FALSE)
。它可以使你的代码运行得更快一些。 - Joseph WoodprimeFactors <- function(n) {
res <- c()
k <- 2
repeat {
if (n %% k == 0) {
res <- append(res, k)
n <- n / k
} else {
k <- k + 1 + (k > 2)
}
if (n == 1) {
return(res)
}
}
}
allFactors <- function(n) {
v <- trunc(c(-sqrt(n):-1, 1:sqrt(n)))
f <- v[n %% v == 0]
unique(sort(c(f, n / f)))
}
> n <- 6
> primeFactors(n)
[1] 2 3
> allFactors(n)
[1] -6 -3 -2 -1 1 2 3 6
并且
> n <- 2459745082
> primeFactors(n)
[1] 2 43 1123 25469
> allFactors(n)
[1] -2459745082 -1229872541 -57203374 -28601687 -2190334 -1095167
[7] -96578 -50938 -48289 -25469 -2246 -1123
[13] -86 -43 -2 -1 1 2
[19] 43 86 1123 2246 25469 48289
[25] 50938 96578 1095167 2190334 28601687 57203374
[31] 1229872541 2459745082
allFactors
存在问题,尤其是这一行:-sqrt(n):sqrt(n)
。尝试使用 n = 6
进行测试。 - Joseph Woods <- as.integer(sqrt(n)); v <- -s:s
- Joseph Woodtrunc(c(-sqrt(n):-1, 1:sqrt(n)))
。 - ThomasIsCoding
div <- seq_len(x); x[x %% div == 0]
的代码,在x
为你所关心的整数时应该可以实现,对吧? - Chasediv[x %% div == 0]
。 - Ramnath