Greg提出的使用biglm
的解决方案比LyzandeR建议的使用lm
的解决方案更快,但仍然相当慢。使用下面我展示的函数可以避免很多开销。如果你使用Rcpp
全部使用C++,那么速度会大大提升。
set.seed(101)
n <- 1000
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(10 + X %*% runif(10)) + rnorm(n)
dat <- data.frame(y = y, X)
biglm_wrapper <- function(formula, data, min_window_size){
mf <- model.frame(formula, data)
X <- model.matrix(terms(mf), mf)
y - model.response(mf)
n <- nrow(X)
p <- ncol(X)
storage.mode(X) <- "double"
storage.mode(y) <- "double"
w <- 1
qr <- list(
d = numeric(p), rbar = numeric(choose(p, 2)),
thetab = numeric(p), sserr = 0, checked = FALSE, tol = numeric(p))
nrbar = length(qr$rbar)
beta. <- numeric(p)
out <- NULL
for(i in 1:n){
row <- X[i, ]
qr[c("d", "rbar", "thetab", "sserr")] <- .Fortran(
"INCLUD", np = p, nrbar = nrbar, weight = w, xrow = row, yelem = y[i],
d = qr$d, rbar = qr$rbar, thetab = qr$thetab, sserr = qr$sserr, ier = i - 0L,
PACKAGE = "biglm")[
c("d", "rbar", "thetab", "sserr")]
if(i >= min_window_size){
tmp <- .Fortran(
"REGCF", np = p, nrbar = nrbar, d = qr$d, rbar = qr$rbar,
thetab = qr$thetab, tol = qr$tol, beta = beta., nreq = p, ier = i,
PACKAGE = "biglm")
Coef <- tmp$beta
R <- diag(p)
R[row(R) > col(R)] <- qr$rbar
R <- t(R)
R <- sqrt(qr$d) * R
ok <- qr$d != 0
R[ok, ok] <- chol2inv(R[ok, ok, drop = FALSE])
R[!ok, ] <- NA
R[ ,!ok] <- NA
out <- c(out, list(cbind(
coef = Coef,
SE = sqrt(diag(R * qr$sserr / (i - p + sum(!ok)))))))
}
}
out
}
library(biglm)
f2 <- function(formula, data, min_window_size){
fit1 <- biglm(formula, data = data[1:min_window_size, ])
data.split <-
split(data, c(rep(NA,min_window_size),1:(nrow(data) - min_window_size)))
out4 <- Reduce(update, data.split, init=fit1, accumulate=TRUE)
lapply(out4, function(x) summary(x)$mat[, c("Coef", "SE")])
}
frm <- y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10
r1 <- biglm_wrapper(frm, dat, 25)
r2 <- f2(frm, dat, 25)
all.equal(r1, r2, check.attributes = FALSE)
microbenchmark::microbenchmark(
r1 = biglm_wrapper(frm, dat, 25),
r2 = f2(frm, dat, 25),
r3 = lapply(
25:nrow(dat), function(x) lm(frm, data = dat[1:x , ])),
times = 5)
all_slopes<-unlist(sapply(1:31,function(j) rolling_lms[[j]]$coefficients[2])
。 - Carl Witthoft