如果您有足够的内存来存储多个数据实例,使用
doParallel
包的这种方法可能有效。在这里,我使用
tidyverse
家族。
library(tidyverse)
library(doParallel)
n <- 1000
big_table <- do.call("rbind", replicate(n, data, simplify = FALSE))
lapply(1:10, function(x) {big_table <<- bind_cols(big_table, big_table); return(x)})
col_list <- names(big_table)
number_of_parallel_cores <- 4
col_group <- split(col_list, sort(rep_len(1:number_of_parallel_cores, length(col_list))))
system.time({
registerDoParallel(number_of_parallel_cores)
combine_data <- bind_rows(foreach(i_col_group = col_group) %dopar% {
big_table %>%
select(one_of(i_col_group)) %>%
summarize(across(.fns = paste, collapse = "")) %>%
pivot_longer(cols = everything(), names_to = "col_names", values_to = "values")
})
})
时间控制
user system elapsed
1.291 0.291 0.898
输出
col_names values
<chr> <chr>
1 Col1...1 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
2 Col2...2 AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
3 Col3...3 ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
4 Col1...4 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
5 Col2...5 AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
6 Col3...6 ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
7 Col1...7 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
8 Col2...8 AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
9 Col3...9 ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
10 Col1...10 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
# … with 3,062 more rows
然而,我发现仅仅将所有内容绑定在一起比并行设置更快。猜测这种操作的开销并不真正划算。
system.time(
big_table %>%
select(one_of(col_list)) %>%
summarize(across(.fns = paste, collapse = "")) %>%
pivot_longer(cols = everything(), names_to = "col_names", values_to = "values")
)
user system elapsed
0.021 0.000 0.022