尽管我很早之前就发表了评论,克里夫仍然没有更正他的答案,因此我在这里给出正确使用 `frollmean` 向量化参数的写法:
# Hand the whole .SD list to frollmean() in one call per group, rather than
# looping over the columns with lapply(); the results are stored by reference
# in new columns named "r_<var>".
DT[,
   paste0("r_", vars) := frollmean(.SD, n = w, align = align),
   by = g,
   .SDcols = vars]
一个基准
# Benchmark: grouped rolling mean over several columns, computed three ways.
library(data.table)
library(zoo)

set.seed(42)
n = 1e6
# Three numeric columns plus a two-level grouping column; the table is keyed on g.
DT = data.table(V1 = rnorm(n), V2 = rlnorm(n), V3 = runif(n), g = rep(c("a", "b"), n / 2), key = "g")
vars = c("V1", "V2", "V3")
w = 48          # rolling window width
align = "left"  # window alignment shared by all three contenders

# Each contender adds the columns r_V1, r_V2, r_V3 to the table it is given
# via `:=`, reading `vars`, `w` and `align` from the enclosing environment.

# zoo::rollmean applied column by column. `fill = NA` is used instead of the
# deprecated `na.pad = TRUE`; the padding of the result is the same.
roland = function(DT) {
  DT[, paste0("r_", vars) := lapply(.SD, rollmean, k = w, fill = NA, align = align),
     by = g, .SDcols = vars]
}

# data.table::frollmean applied column by column through lapply().
cliff = function(DT) {
  DT[, paste0("r_", vars) := lapply(.SD, frollmean, n = w, fill = NA, align = align),
     by = g, .SDcols = vars]
}

# data.table::frollmean called once per group on the whole .SD list.
jan = function(DT) {
  DT[, paste0("r_", vars) := frollmean(.SD, n = w, align = align),
     by = g, .SDcols = vars]
}

# Each contender works on its own copy so the timings do not interfere.
DT1 = copy(DT)
DT2 = copy(DT)
DT3 = copy(DT)

system.time(a1 <- roland(DT1))
system.time(a2 <- cliff(DT2))
system.time(a3 <- jan(DT3))

# All three approaches should produce the same table.
all.equal(DT1, DT2)
all.equal(DT1, DT3)
更重要的是,需要滚动计算的变量越多,相对于逐列 lapply 的加速就越明显,因为各列可以并行计算。
如果我们扩大数据集并从基准测试中去除 `[.data.table` 的开销,这一点很容易观察到。
# Benchmark: frollmean() called per column via lapply() versus a single call
# on the whole table, without going through DT[...].
library(data.table)

set.seed(42)
n = 1e7
nth = 8  # used both as the number of columns and as the data.table thread count
DT = as.data.table(replicate(nth, rnorm(n)))
setDTthreads(nth)
w = 48          # rolling window width
align = "left"  # window alignment

# One copy per contender, mirroring the first benchmark.
DT2 = copy(DT)
DT3 = copy(DT)

# Column-by-column: one frollmean() call per column.
system.time(a2 <- lapply(DT2, frollmean, n = w, align = align))
# Single call on all columns. Uses DT3, which was previously copied but never
# used because both timings ran on DT2.
system.time(a3 <- frollmean(DT3, n = w, align = align))
数据
的例子。 - akrun