新答案
我编写了一个名为 `rollRegres` 的软件包,可以更快地完成此操作。它比 Gavin Simpson 的答案快约 58 倍。以下是一个例子:
# Simulation setup: n observations on 10 predictors, a linear response
# with noise, and Z = [y | X] for the rollapply()-based reference fit.
set.seed(101)
n <- 10000
wdth <- 50  # rolling-window width
X <- matrix(rnorm(10 * n), nrow = n, ncol = 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
lm_version <- function(Z, width = wdth) {
  # One-step-ahead OLS forecast: fit on all rows of Z except the last
  # (column 1 = response, remaining columns = predictors) and predict
  # the response for row `width + 1`.
  newx <- Z[width + 1, -1, drop = FALSE]  # predictors of the forecast row
  train <- Z[-nrow(Z), ]
  ols <- lm.fit(train[, -1, drop = FALSE], train[, 1])
  # Honor QR pivoting and the fitted rank so that rank-deficient
  # windows still yield a forecast from the estimable coefficients.
  keep <- ols$qr$pivot[seq_len(ols$rank)]
  drop(newx[, keep, drop = FALSE] %*% ols$coefficients[keep])
}
library(rollRegres)
library(zoo)
# Sanity check: the rollapply()/lm.fit reference and roll_regres.fit()
# must produce the same one-step-ahead forecasts (leading NAs included).
# NOTE(review): rollRegres and zoo are third-party packages — this chunk
# only runs with them installed.
all.equal(
rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
roll_regres.fit(
x = X, y = y, width = wdth, do_compute = "1_step_forecasts"
)$one_step_forecasts)
library(compiler)
# Byte-compile the reference so the benchmark is not biased against it.
lm_version <- cmpfun(lm_version)
# Compare run times of the package implementation vs. the reference.
microbenchmark::microbenchmark(
newnew = roll_regres.fit(
x = X, y = y, width = wdth, do_compute = "1_step_forecasts"),
prev = rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
times = 10)
旧答案
您可以通过更新分解来减少运行时间。这样,每次迭代的成本为 $O(p^2)$,而不是 $O(np^2)$,其中 n 是您的窗口宽度。以下是一段比较两者的代码。用 C++ 实现可能会更快,但是 LINPACK 的 `dchud` 和 `dchdd` 并没有包含在 R 中,所以您需要编写一个软件包来执行该操作。此外,我记得读到过,相比 LINPACK 的 `dchud` 和 `dchdd`,还有其他实现可以更快地更新三角因子 R。
library(SamplerCompare)
roll_forcast <- function(X, y, width){
  # Rolling one-step-ahead OLS forecasts via rank-one updates of the
  # triangular factor R, instead of refitting each window from scratch.
  # (Name kept as-is — "forcast" — for backward compatibility.)
  #
  # Args:
  #   X:     n x p numeric design matrix.
  #   y:     numeric response vector of length n.
  #   width: window width (observations per fit); assumed < nrow(X).
  #
  # Returns: numeric vector of length n. Element i (for i > width) is the
  #   forecast of y[i] from the window ending at row i - 1; the first
  #   `width` entries remain NA.
  #
  # Each iteration costs O(p^2): R is updated with LINPACK's dchud (add a
  # row) and downdated with dchdd (remove the row leaving the window),
  # both provided by the SamplerCompare package.
  n <- nrow(X)
  p <- ncol(X)
  out <- rep(NA_real_, n)
  is_first <- TRUE
  i <- width
  while(i < n){
    if(is_first){
      is_first <- FALSE
      # Factor the first window once; then transpose X so that a single
      # observation is a contiguous column (cheap to extract below).
      qr. <- qr(X[1:width, ])
      R <- qr.R(qr.)
      X <- t(X)
      XtY <- drop(tcrossprod(y[1:width], X[, 1:width]))
    } else {
      x_new <- X[, i]
      x_old <- X[, i - width]
      # Update: fold the newest observation into R ...
      R <- .Fortran(
        "dchud", R, p, p, x_new, 0., 0L, 0L,
        0., 0., numeric(p), numeric(p),
        PACKAGE = "SamplerCompare")[[1]]
      # ... and downdate: remove the observation that left the window.
      R <- .Fortran(
        "dchdd", R, p, p, x_old, 0., 0L, 0L,
        0., 0., numeric(p), numeric(p), integer(1),
        PACKAGE = "SamplerCompare")[[1]]
      XtY <- XtY + y[i] * x_new - y[i - width] * x_old
    }
    # Solve R'R beta = X'y with two triangular solves. Use the public
    # backsolve() rather than .Internal(backsolve(...)): .Internal() is
    # unsupported outside base R and skips all argument checking.
    coef. <- backsolve(R, XtY, p, upper.tri = TRUE, transpose = TRUE)
    coef. <- backsolve(R, coef., p, upper.tri = TRUE, transpose = FALSE)
    i <- i + 1
    out[i] <- X[, i] %*% coef.
  }
  out
}
# Simulation setup (same data as above): n observations on 10 predictors,
# linear response with noise, and Z = [y | X] for the rollapply reference.
set.seed(101)
n <- 10000
# Fix: the original used `wdth = 50`; top-level assignment should use `<-`.
wdth <- 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
lm_version <- function(Z, width = wdth) {
  # Reference implementation: one-step-ahead OLS forecast. Column 1 of Z
  # is the response, the rest are predictors; the last row of Z is the
  # forecast target and is excluded from the fit.
  target_x <- Z[width + 1, -1, drop = FALSE]
  window <- Z[-nrow(Z), ]
  fitted <- lm.fit(window[, -1, drop = FALSE], window[, 1])
  # Use the QR pivot restricted to the achieved rank, so collinear
  # windows still produce a forecast from the estimable part.
  usable <- fitted$qr$pivot[seq_len(fitted$rank)]
  drop(target_x[, usable, drop = FALSE] %*% fitted$coefficients[usable])
}
library(zoo)
# Sanity check: the rollapply()/lm.fit reference and the QR-updating
# roll_forcast() must agree (leading NAs included).
# NOTE(review): zoo and microbenchmark are third-party packages — this
# chunk only runs with them installed.
all.equal(
rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
roll_forcast(X, y, wdth))
library(compiler)
# Byte-compile both implementations so the comparison is fair.
roll_forcast <- cmpfun(roll_forcast)
lm_version <- cmpfun(lm_version)
# Compare run times: updating the decomposition vs. refitting per window.
microbenchmark::microbenchmark(
new = roll_forcast(X, y, wdth),
prev = rollapply(Z, wdth + 1, FUN = lm_version,
by.column = FALSE, align = "right", fill = NA_real_),
times = 10)
评论区:
- Gavin Simpson:……这就是使用 lm.fit() 的建议代码。做这件事比我想象的要复杂一些。
- Zach:对于 glm 或者 glmnet,我能否实现类似的方法呢?还是你的方法只针对线性回归进行了优化?
- Gavin Simpson:可以用 glm.fit,但你需要像我上面对 lm.fit 所做的那样直接调用它。唯一的方法是使用调试器逐步执行 glm() 调用所做的操作,并查看在哪些步骤需要为 glm.fit 准备相关对象。至于 glmnet(),快速浏览表明您可以调用 glmnet() 内部使用的函数,但这取决于您想要哪个 family。因此,是的,这可以泛化,但您需要先确定所需的快速拟合代码,然后再安排调用它。