R中将矩阵转换为长格式数据框的最快方法

4

我希望找到将矩阵转换为长格式数据框的最快方法。

这里我比较了三种解决方案，但我想知道是否存在更快的方法，例如使用 data.table。

以下是我比较的三种方法的可重现代码:

# Generate matrix -------------------------------------------------------------
# Build a reproducible 1000 x 100 matrix of uniform draws rounded to one
# decimal place, then label its rows and columns.
set.seed(1)
ex <- matrix(data = round(runif(100000), 1), nrow = 1000, ncol = 100)
# seq_len() rather than 1:n: for a zero-length dimension 1:0 would produce
# c(1, 0) and therefore bogus labels, while seq_len(0) is empty.
rownames(ex) <- paste0("row", seq_len(nrow(ex)))
colnames(ex) <- paste0("col", seq_len(ncol(ex)))
# Preview the top-left corner of the matrix.
ex[1:5, 1:5]

     col1 col2 col3 col4 col5
row1  0.3  0.5  0.9  0.8  0.2
row2  0.4  0.7  1.0  0.5  0.5
row3  0.6  0.4  0.9  0.2  0.0
row4  0.9  1.0  0.4  0.4  0.5
row5  0.2  0.1  0.2  0.8  0.9

# table solution --------------------------------------------------------------
# Base R: convert the matrix to a table, then to a data frame.
df1 <- as.data.frame(as.table(ex))

# reshape solution ------------------------------------------------------------
# reshape2: melt the matrix directly.
df2 <- reshape2::melt(ex)

# dplyr solution --------------------------------------------------------------
library(dplyr)
library(tidyr)
# Inside-out: matrix -> data frame -> row names moved into a "Var1" column ->
# every other column gathered into "Var2" / "value" pairs.
df3 <- gather(
  tibble::rownames_to_column(as.data.frame(ex), "Var1"),
  "Var2", "value", -Var1
)

# check for equality ----------------------------------------------------------
# Rename df1's "Freq" column to "value" so its column names line up with the
# other results, then preview the first rows.
names(df1)[names(df1) == "Freq"] <- "value"
head(df1)
  Var1 Var2 value
1 row1 col1   0.3
2 row2 col1   0.4
3 row3 col1   0.6
4 row4 col1   0.9
5 row5 col1   0.2
6 row6 col1   0.9

# Coerce both label columns of df1 and df2 to character before comparing.
df1[c("Var1", "Var2")] <- lapply(df1[c("Var1", "Var2")], as.character)
df2[c("Var1", "Var2")] <- lapply(df2[c("Var1", "Var2")], as.character)

# Pairwise checks against df1.
identical(df1, df2)
identical(df1, df3)
TRUE

# Microbenchmark --------------------------------------------------------------
# Time the three matrix -> long-format conversions on `ex`. Each named
# argument is one candidate expression; the timings are stored in `comp`.
library(microbenchmark)
comp <- microbenchmark(
  # Base R: table view flattened into a data frame.
  table = {
    df1 <- as.data.frame(as.table(ex))
  },

  # reshape2: melt the matrix directly.
  reshape = {
    df2 <- reshape2::melt(ex)
  },

  # tidyverse: data frame -> row names as "Var1" -> gather remaining columns.
  dplyr = {
    df3 <- ex %>%
      as.data.frame() %>%
      tibble::rownames_to_column("Var1") %>%
      gather("Var2", "value", -Var1)
  }
)

# Plot the benchmark results.
library(ggplot2)
autoplot(comp)

enter image description here

reshape 方法（即 reshape2::melt）目前是最快的。


你可以尝试使用 melt(as.data.table(ex)[, rn := seq_len(.N)], id.var = 'rn') - akrun
或者 dt = melt(data.table(ex, keep.rownames = TRUE) , id.vars = c("rn")) - Kipras Kančys
或者 data.frame(Var1 = rownames(ex), Var2 = colnames(ex), value = c(ex)) - markus
1个回答

4

如此众多的选择:

library(dplyr)
library(tidyr)
library(data.table)

library(microbenchmark)
library(ggplot2)

# Reproducible 1000 x 100 example matrix of uniform draws rounded to one
# decimal place, with labelled rows and columns.
set.seed(1)
ex <- matrix(data = round(runif(100000), 1), nrow = 1000, ncol = 100)
# seq_len() rather than 1:n, so a zero-length dimension yields no labels
# instead of the c(1, 0) that 1:0 would produce.
rownames(ex) <- paste0("row", seq_len(nrow(ex)))
colnames(ex) <- paste0("col", seq_len(ncol(ex)))


# Time six matrix -> long-format conversions on `ex`. Each named argument is
# one candidate expression; the timings are stored in `comp`.
comp <- microbenchmark(
    # Base R: table view flattened into a data frame.
    table = {
        df1 <- as.data.frame(as.table(ex))
    },

    # reshape2: melt the matrix directly.
    reshape = {
        df2 <- reshape2::melt(ex)
    },

    # tidyverse: data frame -> row names as "Var1" -> gather other columns.
    dplyr = {
        df3 <- ex %>%
            as.data.frame() %>%
            tibble::rownames_to_column("Var1") %>%
            gather("Var2", "value", -Var1)
    },

    # data.table: keep the row names as column "rn", then melt on it.
    data.table = {
        dt <- melt(data.table(ex, keep.rownames = TRUE), id.vars = "rn")
    },

    # data.table variant that uses the row index as "rn".
    data.table2 = {
        melt(as.data.table(ex)[, rn := seq_len(.N)], id.vars = "rn")
    },

    # Direct construction from the column-major vector c(ex). The labels
    # must be expanded explicitly: row names cycle once per column, while
    # each column name is repeated for a whole column. Passing colnames(ex)
    # as-is would recycle it element by element and attach the wrong column
    # label to most values.
    data.table3 = {
        data.table(
            Var1 = rep(rownames(ex), times = ncol(ex)),
            Var2 = rep(colnames(ex), each = nrow(ex)),
            value = c(ex)
        )
    }

)

# Plot the benchmark results.
autoplot(comp)

enter image description here


感谢您的比较。data.table3 的解决方案似乎与其他解决方案给出了不同的结果。 - P. Denelle

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接