我正在对股票收益率做多窗口的范围计算(即每个时间窗口内的最大值和最小值)。我先用 dplyr 写了一个版本,但很多人发布的基准测试都显示 data.table 的计算速度更快。于是我又用 data.table 语法写了一个版本,结果它反而比 dplyr 版本更慢。有人能帮我找到更好地利用 data.table 来加快速度的方法吗?非常感谢。
library(Quandl)
library(tidyr)
library(dplyr)
library(data.table)
library(microbenchmark)
# Quandl dataset codes for the four stocks being compared.
tickers <- c(
  "GOOG/NASDAQ_AAPL",
  "GOOG/NASDAQ_MSFT",
  "GOOG/NYSE_IBM",
  "GOOG/NASDAQ_GOOG"
)

# Download all series in one call, with the "rdiff" transformation applied.
data <- Quandl(tickers, transformation = "rdiff")

# Reshape to long form: one row per Date and original column.
returns <- gather(data, stock, value, -Date)
# Split each original column name on " - " into a stock name and a field.
returns <- separate(returns, stock, c("name", "field"), " - ")
# Keep only the "Close" rows; the field column is then redundant.
returns <- filter(returns, field == "Close")
returns <- select(returns, -field)

# data.table copy of the same rows for the data.table implementation.
returns_dt <- data.table(returns)
# Per-stock maximum and minimum of `value` over three trailing windows (dplyr).
#
# data: a data frame with columns `Date`, `name` and `value`.
#
# Returns one row per `name` that has rows in all three windows, with columns
# max_1y, min_1y, max_2y, min_2y, max_5y, min_5y. A window of k years is the
# set of rows whose Date is on or after Sys.Date() - 365 * k. NA values are
# ignored when taking the max/min.
multi_window_range <- function(data) {
  # Trailing 365 days.
  recent_1y <- group_by(filter(data, Date >= Sys.Date() - 365), name)
  range_1y <- summarise(
    recent_1y,
    max_1y = max(value, na.rm = TRUE),
    min_1y = min(value, na.rm = TRUE)
  )

  # Trailing 2 * 365 days.
  recent_2y <- group_by(filter(data, Date >= Sys.Date() - 365 * 2), name)
  range_2y <- summarise(
    recent_2y,
    max_2y = max(value, na.rm = TRUE),
    min_2y = min(value, na.rm = TRUE)
  )

  # Trailing 5 * 365 days.
  recent_5y <- group_by(filter(data, Date >= Sys.Date() - 365 * 5), name)
  range_5y <- summarise(
    recent_5y,
    max_5y = max(value, na.rm = TRUE),
    min_5y = min(value, na.rm = TRUE)
  )

  # Inner joins: a stock missing from any window is dropped from the result.
  range_1y_2y <- inner_join(range_1y, range_2y, by = "name")
  inner_join(range_1y_2y, range_5y, by = "name")
}
# Per-stock maximum and minimum of `value` over three trailing windows
# (data.table counterpart of multi_window_range()).
#
# data: a data.table with columns `Date`, `name` and `value`.
#
# Returns a data.table with one row per `name` that has rows in all three
# windows, with columns max_1y, min_1y, max_2y, min_2y, max_5y, min_5y. A
# window of k years is the set of rows whose Date is on or after
# Sys.Date() - 365 * k. NA values are ignored when taking the max/min.
#
# Side effect: setkey() works by reference, so the caller's `data` is sorted
# by `name` and keyed on it after the first call.
multi_window_range_dt <- function(data) {
  # Keying sorts the rows by name, so the grouped results come out in name
  # order; on an already-keyed table this call has nothing left to sort.
  setkey(data, name)

  result_1y <- data[
    Date >= Sys.Date() - 365,
    list(
      max_1y = max(value, na.rm = TRUE),
      min_1y = min(value, na.rm = TRUE)
    ),
    by = "name"
  ]
  result_2y <- data[
    Date >= Sys.Date() - 365 * 2,
    list(
      max_2y = max(value, na.rm = TRUE),
      min_2y = min(value, na.rm = TRUE)
    ),
    by = "name"
  ]
  result_5y <- data[
    Date >= Sys.Date() - 365 * 5,
    list(
      max_5y = max(value, na.rm = TRUE),
      min_5y = min(value, na.rm = TRUE)
    ),
    by = "name"
  ]

  # A plain X[Y] keeps every row of Y and fills NA where X has no match.
  # nomatch = 0L turns each step into an inner join, matching the inner_join()
  # used by the dplyr implementation this function is benchmarked against.
  # The explicit `on` avoids depending on the aggregated tables having
  # inherited a key from `data`.
  result_1y_2y <- result_1y[result_2y, on = "name", nomatch = 0L]
  result_1y_2y[result_5y, on = "name", nomatch = 0L]
}
# Time the dplyr and data.table implementations on the same rows. The
# expressions are left unnamed so each result row is labelled by its call.
microbenchmark(
  multi_window_range(returns), multi_window_range_dt(returns_dt),
  times = 100L
)
Unit: milliseconds
expr min lq mean median uq max neval
multi_window_range(returns) 6.341532 6.522303 6.915266 6.692666 6.922623 10.16709 100
multi_window_range_dt(returns_dt) 7.537073 7.738516 8.066579 7.865968 8.073114 12.68021 100
`by="name"` 的拆分只需做一次,而不是三次。公平的比较应该让 dplyr 也同样只分组一次。 - Daniel Krizian

`summarise` 不接受 `range`,而 `do` 也不是很快。 - kismsu

`data %>% filter(Date > Sys.Date() - 5*365, !is.na(value)) %>% group_by(name) %>% summarize(max_1y = max(value[Date > Sys.Date() - 365]), min_1y = min(value[Date > Sys.Date() - 365]), max_2y = max(value[Date > Sys.Date() - 2*365]), min_2y = min(value[Date > Sys.Date() - 2*365]), max_5y = max(value), min_5y = min(value))` - G. Grothendieck