我正在整理一些材料,用来说明在 R 中用 `for` 循环逐步增长数据结构的危害。我希望能够解释造成性能差异的内部机制,尤其是不同方法之间巨大的内存差异。
我对比了三种方法:
- 使用 `c()` 函数逐步增长结果向量。
- 通过赋值逐步增长结果向量。
- 预分配结果向量。
考虑以下示例:
library(pryr)
x <- runif(10, min = 1, max = 100)
# Square-root every element of `x`, growing the result with c() one element
# at a time (kept this way on purpose: it is the pattern being studied).
#
# Arguments:
#   x     - vector whose elements are passed to sqrt() one by one.
#   print - when TRUE (default), print address() and refs() of the result
#           after every append.
# Returns the accumulated square roots, or NULL when `x` has no elements.
for_loop_c <- function(x, print = TRUE) {
  roots <- NULL
  for (idx in seq_along(x)) {
    root_i <- sqrt(x[idx])
    # c() builds the extended vector, which is then rebound to `roots`
    roots <- c(roots, root_i)
    if (print) {
      print(c(address(roots), refs(roots)))
    }
  }
  roots
}
# Square-root every element of `x`, growing the result by assigning to the
# next index of a vector that starts out as NULL (kept this way on purpose:
# it is the pattern being studied).
#
# Arguments:
#   x     - vector whose elements are passed to sqrt() one by one.
#   print - when TRUE (default), print address() and refs() of the result
#           after every assignment.
# Returns the accumulated square roots, or NULL when `x` has no elements.
for_loop_assign <- function(x, print = TRUE) {
  roots <- NULL
  for (idx in seq_along(x)) {
    root_i <- sqrt(x[idx])
    # assigning at position idx extends `roots` by one element
    roots[idx] <- root_i
    if (print) {
      print(c(address(roots), refs(roots)))
    }
  }
  roots
}
# Square-root every element of `x`, writing into a result vector that is
# created at its final length before the loop starts.
#
# Arguments:
#   x     - vector whose elements are passed to sqrt() one by one.
#   print - when TRUE (default), print address() and refs() of the result
#           after every assignment.
# Returns a double vector with the same length as `x`.
for_loop_preallocate <- function(x, print = TRUE) {
  roots <- double(length(x))
  for (idx in seq_along(x)) {
    root_i <- sqrt(x[idx])
    roots[idx] <- root_i
    if (print) {
      print(c(address(roots), refs(roots)))
    }
  }
  roots
}
# Run functions and check for copies by changes to address and refs
for_loop_c(x)
#> [1] "0x11bfbdbf8" "1"
#> [1] "0x11bf9b948" "1"
#> [1] "0x11bf9f398" "1"
#> [1] "0x11bf9f258" "1"
#> [1] "0x11bf82938" "1"
#> [1] "0x11bf82778" "1"
#> [1] "0x11bf825b8" "1"
#> [1] "0x11bf823f8" "1"
#> [1] "0x11bf55768" "1"
#> [1] "0x11bf55608" "1"
#> [1] 3.976751 6.148983 9.373843 7.928771 5.321063 7.238960 5.707823 9.921684
#> [9] 7.643938 3.764301
for_loop_assign(x)
#> [1] "0x11c2ee4e8" "1"
#> [1] "0x11c2bb608" "1"
#> [1] "0x11c2b6c28" "1"
#> [1] "0x11c2b6ae8" "1"
#> [1] "0x11c224d48" "1"
#> [1] "0x11c224b88" "1"
#> [1] "0x11c2249c8" "1"
#> [1] "0x11c224808" "1"
#> [1] "0x11c2d3748" "1"
#> [1] "0x11c2d35e8" "1"
#> [1] 3.976751 6.148983 9.373843 7.928771 5.321063 7.238960 5.707823 9.921684
#> [9] 7.643938 3.764301
for_loop_preallocate(x)
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] 3.976751 6.148983 9.373843 7.928771 5.321063 7.238960 5.707823 9.921684
#> [9] 7.643938 3.764301
# Create a bigger example x (10,000 values) for benchmarking
x <- runif(10000, min = 1, max = 100)
# Benchmark the three approaches on the same x. print = FALSE switches off
# the per-iteration address()/refs() printing inside each function.
bench::mark(
for_loop_c(x, print = FALSE),
for_loop_assign(x, print = FALSE),
for_loop_preallocate(x, print = FALSE)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 for_loop_c(x, print = FALSE) 106ms 114.92ms 8.57 381.96MB
#> 2 for_loop_assign(x, print = FALSE) 1.19ms 1.27ms 621. 1.66MB
#> 3 for_loop_preallocate(x, print = FALSE) 381.71µs 386.88µs 2554. 78.17KB
#> # … with 1 more variable: `gc/sec` <dbl>
library(profmem)
gc()
#> used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
#> Ncells 824931 44.1 1409852 75.3 NA 1409852 75.3
#> Vcells 1483448 11.4 8388608 64.0 32768 8388585 64.0
# Profile memory allocations while growing y with c() over the current x
pm1 <- profmem({
y <- NULL
for (i in seq_along(x)) {
y <- c(y, sqrt(x[i]))
}
})
# Profile memory allocations while growing y by assigning to y[i].
# Unlike the pm1 expression, this one also evaluates y at the end.
pm2 <- profmem({
y <- NULL
for (i in seq_along(x)) {
y[i] <- sqrt(x[i])
}
y
})
# Number of times memory allocation occurred
pm1$bytes |> length()
#> [1] 10061
pm2$bytes |> length()
#> [1] 174
使用 reprex v2.0.2 于2023年02月02日创建
会话信息:sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Monterey 12.3.1
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Athens
#> date 2023-02-02
#> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> ! package * version date (UTC) lib source
#> bench 1.1.2 2021-11-30 [1] CRAN (R 4.2.0)
#> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0)
#> codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
#> P digest 0.6.29 2021-12-01 [?] CRAN (R 4.2.0)
#> P evaluate 0.16 2022-08-09 [?] CRAN (R 4.2.1)
#> fansi 1.0.3 2022-03-24 [2] CRAN (R 4.2.0)
#> P fastmap 1.1.0 2021-01-25 [?] CRAN (R 4.2.0)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.0)
#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.2.0)
#> P highr 0.9 2021-04-16 [?] CRAN (R 4.2.1)
#> P htmltools 0.5.3 2022-07-18 [?] CRAN (R 4.2.0)
#> P knitr 1.40 2022-08-24 [?] CRAN (R 4.2.0)
#> lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.2.0)
#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.2.0)
#> pillar 1.8.1 2022-08-19 [2] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.2.0)
#> P profmem * 0.6.0 2020-12-13 [?] CRAN (R 4.2.0)
#> pryr * 0.1.6 2023-01-17 [1] CRAN (R 4.2.0)
#> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
#> P R.cache 0.16.0 2022-07-21 [?] CRAN (R 4.2.0)
#> P R.methodsS3 1.8.2 2022-06-13 [?] CRAN (R 4.2.0)
#> P R.oo 1.25.0 2022-06-12 [?] CRAN (R 4.2.0)
#> P R.utils 2.12.2 2022-11-11 [?] CRAN (R 4.2.0)
#> Rcpp 1.0.9 2022-07-08 [2] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [2] CRAN (R 4.2.0)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0)
#> P rmarkdown 2.16 2022-08-24 [?] CRAN (R 4.2.0)
#> rstudioapi 0.14 2022-08-22 [2] CRAN (R 4.2.0)
#> sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.2.0)
#> P stringi 1.7.8 2022-07-11 [?] CRAN (R 4.2.0)
#> P stringr 1.4.1 2022-08-20 [?] CRAN (R 4.2.0)
#> P styler 1.9.0 2023-01-15 [?] CRAN (R 4.2.0)
#> tibble 3.1.8 2022-07-22 [2] CRAN (R 4.2.0)
#> utf8 1.2.2 2021-07-24 [2] CRAN (R 4.2.0)
#> P vctrs 0.5.1 2022-11-16 [?] CRAN (R 4.2.0)
#> withr 2.5.0 2022-03-03 [2] CRAN (R 4.2.0)
#> P xfun 0.33 2022-09-12 [?] CRAN (R 4.2.1)
#> P yaml 2.3.5 2022-02-21 [?] CRAN (R 4.2.0)
#>
#> [1] /*/renv/library/R-4.2/aarch64-apple-darwin20
#> [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#>
#> P ── Loaded and on-disk path mismatch.
#>
#> ──────────────────────────────────────────────────────────────────────────────
我理解为什么预分配是最高效的(不发生复制,每次迭代都使用同一个地址)。
我认为使用 `c()` 时发生的情况是:函数内部先完整复制了一次 `y`,然后在把结果重新赋值给 `y` 时又复制了一次;而通过赋值增长时也会发生复制(因此地址会变化),但只在赋值时复制一次?
我的问题是:
- 我的一般理解正确吗?
- 就方法 1 和方法 2 中发生的复制及其大小而言,是什么导致了两者在内存使用量和内存分配次数上的巨大差异?
- 有没有好的方法来演示方法1和2之间到底发生了什么?
编辑
鉴于@Kevin-Ushey和@alexis_laz的反馈,我改编了我的示例以记录每次迭代时累积的地址更改次数:
library(pryr)
library(ggplot2)
# Grow a result vector with c() and record, for every iteration, how many
# times the address of the result has changed so far.
#
# Arguments:
#   x          - vector whose elements are passed to sqrt() one by one.
#   count_addr - currently unused; kept so that existing calls which pass it
#                keep working.
# Returns a data frame with one row per element of `x` and the columns
#   i             - iteration index,
#   cum_address_n - cumulative number of address changes up to iteration i,
#   mode          - the constant label "c".
# An empty `x` gives a zero-row data frame with the same columns.
for_loop_c <- function(x, count_addr = TRUE) {
  n <- length(x)
  y <- NULL
  last_addr <- address(y)
  changes <- 0
  cum_changes <- numeric(n)
  for (i in seq_along(x)) {
    y <- c(y, sqrt(x[i]))
    # Look the address up once per iteration and reuse it below
    current_addr <- address(y)
    if (current_addr != last_addr) {
      changes <- changes + 1
      last_addr <- current_addr
    }
    cum_changes[i] <- changes
  }
  # rep() keeps `mode` the same length as the other columns; a bare "c"
  # makes data.frame() fail when `x` is empty (0 rows vs. 1 row).
  data.frame(
    i = seq_along(cum_changes),
    cum_address_n = cum_changes,
    mode = rep("c", n)
  )
}
# Grow a result vector by assigning to the next index and record, for every
# iteration, how many times the address of the result has changed so far.
#
# Arguments:
#   x - vector whose elements are passed to sqrt() one by one.
# Returns a data frame with one row per element of `x` and the columns
#   i             - iteration index,
#   cum_address_n - cumulative number of address changes up to iteration i,
#   mode          - the constant label "assign".
# An empty `x` gives a zero-row data frame with the same columns.
for_loop_assign <- function(x) {
  n <- length(x)
  y <- NULL
  last_addr <- address(y)
  changes <- 0
  cum_changes <- numeric(n)
  for (i in seq_along(x)) {
    y[i] <- sqrt(x[i])
    # Look the address up once per iteration and reuse it below
    current_addr <- address(y)
    if (current_addr != last_addr) {
      changes <- changes + 1
      last_addr <- current_addr
    }
    cum_changes[i] <- changes
  }
  # rep() keeps `mode` the same length as the other columns; a bare "assign"
  # makes data.frame() fail when `x` is empty (0 rows vs. 1 row).
  data.frame(
    i = seq_along(cum_changes),
    cum_address_n = cum_changes,
    mode = rep("assign", n)
  )
}
# 10,000 random inputs drawn between 1 and 100
x <- runif(10000, min = 1, max = 100)
# Stack the per-iteration counts from both approaches and plot the cumulative
# number of address changes against the iteration index, one line per mode
rbind(for_loop_c(x), for_loop_assign(x)) |>
ggplot(aes(x = i, y = cum_address_n, colour = mode)) +
geom_line()
使用 reprex v2.0.2 于2023年2月3日创建
会话信息sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Monterey 12.3.1
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Athens
#> date 2023-02-03
#> pandoc 2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> ! package * version date (UTC) lib source
#> P assertthat 0.2.1 2019-03-21 [?] CRAN (R 4.2.0)
#> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0)
#> codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
#> P colorspace 2.0-3 2022-02-21 [?] CRAN (R 4.2.1)
#> curl 4.3.2 2021-06-23 [2] CRAN (R 4.2.0)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0)
#> P digest 0.6.29 2021-12-01 [?] CRAN (R 4.2.0)
#> dplyr 1.0.10 2022-09-01 [2] CRAN (R 4.2.0)
#> P evaluate 0.16 2022-08-09 [?] CRAN (R 4.2.1)
#> fansi 1.0.3 2022-03-24 [2] CRAN (R 4.2.0)
#> P farver 2.1.1 2022-07-06 [?] CRAN (R 4.2.1)
#> P fastmap 1.1.0 2021-01-25 [?] CRAN (R 4.2.0)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.0)
#> generics 0.1.3 2022-07-05 [2] CRAN (R 4.2.0)
#> P ggplot2 * 3.4.0 2022-11-04 [?] CRAN (R 4.2.0)
#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.2.0)
#> P gtable 0.3.1 2022-09-01 [?] CRAN (R 4.2.1)
#> P highr 0.9 2021-04-16 [?] CRAN (R 4.2.1)
#> P htmltools 0.5.3 2022-07-18 [?] CRAN (R 4.2.0)
#> httr 1.4.4 2022-08-17 [2] CRAN (R 4.2.0)
#> P knitr 1.40 2022-08-24 [?] CRAN (R 4.2.0)
#> P labeling 0.4.2 2020-10-20 [?] CRAN (R 4.2.1)
#> lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.2.0)
#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.2.0)
#> mime 0.12 2021-09-28 [2] CRAN (R 4.2.0)
#> P munsell 0.5.0 2018-06-12 [?] CRAN (R 4.2.1)
#> pillar 1.8.1 2022-08-19 [2] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.2.0)
#> pryr * 0.1.6 2023-01-17 [1] CRAN (R 4.2.0)
#> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
#> P R.cache 0.16.0 2022-07-21 [?] CRAN (R 4.2.0)
#> P R.methodsS3 1.8.2 2022-06-13 [?] CRAN (R 4.2.0)
#> P R.oo 1.25.0 2022-06-12 [?] CRAN (R 4.2.0)
#> P R.utils 2.12.2 2022-11-11 [?] CRAN (R 4.2.0)
#> P R6 2.5.1 2021-08-19 [?] CRAN (R 4.2.0)
#> Rcpp 1.0.9 2022-07-08 [2] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [2] CRAN (R 4.2.0)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0)
#> P rmarkdown 2.16 2022-08-24 [?] CRAN (R 4.2.0)
#> rstudioapi 0.14 2022-08-22 [2] CRAN (R 4.2.0)
#> P scales 1.2.1 2022-08-20 [?] CRAN (R 4.2.1)
#> sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.2.0)
#> P stringi 1.7.8 2022-07-11 [?] CRAN (R 4.2.0)
#> P stringr 1.4.1 2022-08-20 [?] CRAN (R 4.2.0)
#> P styler 1.9.0 2023-01-15 [?] CRAN (R 4.2.0)
#> tibble 3.1.8 2022-07-22 [2] CRAN (R 4.2.0)
#> P tidyselect 1.2.0 2022-10-10 [?] CRAN (R 4.2.0)
#> utf8 1.2.2 2021-07-24 [2] CRAN (R 4.2.0)
#> P vctrs 0.5.1 2022-11-16 [?] CRAN (R 4.2.0)
#> withr 2.5.0 2022-03-03 [2] CRAN (R 4.2.0)
#> P xfun 0.33 2022-09-12 [?] CRAN (R 4.2.1)
#> xml2 1.3.3 2021-11-30 [2] CRAN (R 4.2.0)
#> P yaml 2.3.5 2022-02-21 [?] CRAN (R 4.2.0)
#>
#> [1] /*/optimise-r/renv/library/R-4.2/aarch64-apple-darwin20
#> [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#>
#> P ── Loaded and on-disk path mismatch.
#>
#> ──────────────────────────────────────────────────────────────────────────────
根据答案和评论的反馈,我目前的理解是:
- 使用赋值时地址变化较少;当 `y` 增大到不再由 R 的小向量池管理、而是需要向操作系统申请额外内存时,地址变化会进一步减少。我认为这意味着:对于较大的向量,在两次申请额外内存之间,R 可以直接原地修改通过赋值增长的对象;又因为每次迭代的改动非常小,循环可以运行很多次迭代而无需再次申请内存。
- 使用 `c()` 时每次迭代都会导致地址变化。不过我仍然不清楚,这是因为 `c()` 在内部修改了 `y` 从而触发了复制,还是因为把一个完整的新 `y` 重新赋值给 `y`(而不是只赋值单个新增元素)?
当 `[<-`(子赋值)的下标超出向量长度时,R 会在适当的情况下把向量额外扩大 5%(其 "truelength" 无法通过 `length` 获取),因此 R 预先占用了隐藏的 5% 额外内存,可以继续原地增长向量。使用 `c` 时,每次都会分配一个(长度合适的)新向量,并把 `c` 的各个参数复制到这个新向量中。把 `c` 创建的对象绑定到某个符号(这里是 "y")不会产生任何复制(除非该对象还被另一个符号引用,或正在被修改)。 - alexis_laz