如何对包含数据框列的数据框使用 rbind() / dplyr::bind_rows() / data.table::rbindlist()？

Question

如何对包含数据框列的数据框使用 rbind() / dplyr::bind_rows() / data.table::rbindlist()？

7

基础 R、dplyr 和 data.table 都无法正确地按行合并包含数据框列的数据框：

# Build two one-row data frames; each has a regular column `a` and a
# column `b` that is itself a one-column data frame.
x <- data.frame(a=1)
x$b <- data.frame(z=2)
y <- data.frame(a=3)
y$b <- data.frame(z=4)

# base and dplyr fail
rbind(x, y)
#> Warning: non-unique value when setting 'row.names': '1'
#> Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
dplyr::bind_rows(x,y)
#> Error: Argument 2 can't be a list containing data frames

# data.table gives a result that doesn't make much sense to me
# (in the str() output below, `b` is reported with 1 observation and two
# unnamed variables instead of 2 observations of one variable `z`)
str(data.table::rbindlist(list(x,y)))
#> Warning in setDT(ans): Some columns are a multi-column type (such as a matrix
#> column): [2]. setDT will retain these columns as-is but subsequent operations
#> like grouping and joining may fail. Please consider as.data.table() instead
#> which will create a new column for each embedded column.
#> Classes 'data.table' and 'data.frame':   2 obs. of  2 variables:
#>  $ a: num  1 3
#>  $ b:'data.frame':   1 obs. of  2 variables:
#>   ..$ : num 2
#>   ..$ : num 4
#>  - attr(*, ".internal.selfref")=<externalptr>

^{这段代码是由 reprex包（v0.3.0）于2020-01-03创建的。}

我的预期输出是将数据框列绑定，从而得到类似下面的res：

# Hand-built illustration of the desired shape: two rows, with `b` being a
# two-row data frame column.
# NOTE(review): the inputs in the question have z = 2 and z = 4, so
# z = c(3, 4) below looks like a typo for c(2, 4) -- confirm with the author.
res <- data.frame(a= c(1,3))
res$b <- data.frame(z = c(3,4))
res
#>   a z
#> 1 1 3
#> 2 3 4
str(res)
#> 'data.frame':    2 obs. of  2 variables:
#>  $ a: num  1 3
#>  $ b:'data.frame':   2 obs. of  1 variable:
#>   ..$ z: num  3 4

我该如何解决这个问题？

- moodymudskipper

1

如果我理解有误请纠正：您期望的结果是否应为 res$b <- data.frame(z = c(2, 4))？ - Ali

2

@Ali，我希望我的编辑能够阐明我想要的东西。 - moodymudskipper

3个回答

2

问题似乎在于 bind 函数在 x/y 中处理数据框 b 的行名时出现了问题。我们可以通过重命名行来避免这个问题（见下文）。重要提示：现在 dplyr 已经能够处理这个例子了，不再需要使用解决方法。

# Setup
# Two one-row data frames, each with a data frame column `b`.
x <- data.frame(a=1)
x$b <- data.frame(z=2)
y <- data.frame(a=3)
y$b <- data.frame(z=4)

rbind(x, y) # still does not work
#> Warning: non-unique value when setting 'row.names': '1'
#> Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
# NOTE: the bind_rows() call below is namespace-qualified, so attaching
# dplyr here is not needed for that call.
require(dplyr)
dplyr::bind_rows(x,y) # works!!!
#>   a z
#> 1 1 2
#> 2 3 4


# Avoid conflicting row names
# Renumber the rows of `x` (and of its nested data frame `x$b`) so they start
# after the last row number of `y`; the row names of the two inputs then no
# longer collide.
row.names(x)   <- seq(nrow(y)+1, nrow(y)+nrow(x))
row.names(x$b) <- seq(nrow(y)+1, nrow(y)+nrow(x))

rbind(x, y) #works now, too
#>   a z
#> 2 1 2
#> 1 3 4

^{由 reprex package (v0.3.0) 创建于2020年6月27日}

- Jan

0

为了更清晰地回答，我们可以期望bind_rows()在未来支持数据框列，但同时我们可以像Romain François建议的那样使用vctrs::vec_rbind()，详情请参见https://github.com/tidyverse/dplyr/issues/4226。

# Two one-row data frames, each with a data frame column `b`.
x <- data.frame(a=1)
x$b <- data.frame(z=2)
y <- data.frame(a=3)
y$b <- data.frame(z=4)

# vctrs::vec_rbind() binds the rows, including the data frame column.
res <- vctrs::vec_rbind(x,y)

res
#>   a z
#> 1 1 2
#> 2 3 4

# str() confirms that `b` is still a data frame column, now with two rows.
str(res)
#> 'data.frame':    2 obs. of  2 variables:
#>  $ a: num  1 3
#>  $ b:'data.frame':   2 obs. of  1 variable:
#>   ..$ z: num  2 4

^{本段内容于2020年1月6日由reprex package（v0.3.0）创建}

- moodymudskipper

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- moodymudskipper · Accepted Answer

我们可以把数据框列与常规列分开绑定。下面给出三个类似的解决方案，分别封装了问题中提到的三个函数。

基础 R

#' Row-bind data frames that may contain data frame columns (base R).
#'
#' Regular columns are bound with `rbind()`. Each data frame column is bound
#' separately through a recursive call and then attached to the result, so
#' data frame columns nested inside data frame columns are handled as well.
#'
#' @param ... Data frames to bind by row.
#' @return The result of `rbind()` on the regular columns, with one bound
#'   data frame column added per data frame column name found in the inputs.
#'   Signals an error if a column is a data frame in one input but a
#'   non-data-frame column in another.
rbind_fixed <- function(...) {
  dfs <- list(...)

  # Names of all columns that are a data frame in at least one input.
  # vapply() returns a logical vector even for a zero-column data frame,
  # whereas sapply() returns an empty list there, which is not a valid
  # subscript.
  get_df_col_ind <- function(df) vapply(df, is.data.frame, logical(1))
  df_col_names_list <- lapply(dfs, function(df) names(df[get_df_col_ind(df)]))
  df_col_names <- unique(unlist(df_col_names_list, use.names = FALSE))

  # Fail if such a column is present but not a data frame in some input.
  for (df_col_name in df_col_names) {
    for (df in dfs) {
      if (!is.null(df[[df_col_name]]) && !is.data.frame(df[[df_col_name]])) {
        stop(df_col_name, " is not consistently a data frame column")
      }
    }
  }

  # Bind everything except the data frame columns.
  dfs_regular <- lapply(dfs, function(df) df[setdiff(names(df), df_col_names)])
  res <- do.call(rbind, dfs_regular)

  # Bind each data frame column separately and add it to the result.
  for (df_col_name in df_col_names) {
    subdfs <- lapply(dfs, function(df) {
      if (df_col_name %in% names(df)) {
        df[[df_col_name]]
      } else {
        # Zero-column placeholder with the same number of rows as `df`;
        # seq_len() keeps it at zero rows when `df` has none.
        data.frame(row.names = seq_len(nrow(df)))
      }
    })
    # Recursive, to be robust to deeply nested data frame columns.
    res[[df_col_name]] <- do.call(rbind_fixed, subdfs)
  }
  res
}
rbind_fixed(x, y)
#>   a z
#> 1 1 2
#> 2 3 4

dplyr

#' Row-bind data frames that may contain data frame columns (dplyr).
#'
#' Regular columns are bound with `dplyr::bind_rows()`. Each data frame
#' column is bound separately through a recursive call and then attached to
#' the result, so nested data frame columns are handled as well.
#'
#' @param ... Data frames to bind by row. A list of data frames can be
#'   spliced in with `!!!`.
#' @return The result of `dplyr::bind_rows()` on the regular columns, with one
#'   bound data frame column added per data frame column name found in the
#'   inputs. Signals an error if a column is a data frame in one input but a
#'   non-data-frame column in another.
bind_rows_fixed <- function(...) {
  # Use list2() so `!!!` can be used, as wrapping bind_rows() loses its
  # "autosplice" feature.
  dfs <- rlang::list2(...)

  # Names of all columns that are a data frame in at least one input.
  # vapply() returns a logical vector even for a zero-column data frame
  # (such as the placeholder created below), whereas sapply() returns an
  # empty list there, which is not a valid subscript.
  get_df_col_ind <- function(df) vapply(df, is.data.frame, logical(1))
  df_col_names_list <- lapply(dfs, function(df) names(df[get_df_col_ind(df)]))
  df_col_names <- unique(unlist(df_col_names_list, use.names = FALSE))

  # Fail if such a column is present but not a data frame in some input.
  for (df_col_name in df_col_names) {
    for (df in dfs) {
      if (!is.null(df[[df_col_name]]) && !is.data.frame(df[[df_col_name]])) {
        stop(df_col_name, " is not consistently a data frame column")
      }
    }
  }

  # Bind everything except the data frame columns.
  dfs_regular <- lapply(dfs, function(df) df[setdiff(names(df), df_col_names)])
  res <- dplyr::bind_rows(dfs_regular)

  # Bind each data frame column separately and add it to the result.
  for (df_col_name in df_col_names) {
    subdfs <- lapply(dfs, function(df) {
      if (df_col_name %in% names(df)) {
        df[[df_col_name]]
      } else {
        # Zero-column placeholder with the same number of rows as `df`.
        # Namespace-qualified so the function does not depend on tibble or
        # dplyr being attached.
        dplyr::tibble(.rows = nrow(df))
      }
    })

    # Recursive, to be robust to deeply nested data frame columns.
    res[[df_col_name]] <- bind_rows_fixed(!!!subdfs)
  }
  res
}
bind_rows_fixed(x,y)
#>   a z
#> 1 1 2
#> 2 3 4

data.table

#' Row-bind a list of data frames that may contain data frame columns
#' (data.table).
#'
#' Regular columns are bound with `data.table::rbindlist()`. Each data frame
#' column is bound separately through a recursive call and then attached to
#' the result, so nested data frame columns are handled as well.
#'
#' @param l A list of data frames to bind by row.
#' @return The result of `data.table::rbindlist()` on the regular columns,
#'   with one bound column added per data frame column name found in the
#'   inputs. Signals an error if a column is a data frame in one input but a
#'   non-data-frame column in another.
rbindlist_fixed <- function(l) {
  dfs <- l

  # Names of all columns that are a data frame in at least one input.
  # vapply() returns a logical vector even for a zero-column data frame,
  # whereas sapply() returns an empty list there, which is not a valid
  # subscript.
  get_df_col_ind <- function(df) vapply(df, is.data.frame, logical(1))
  df_col_names_list <- lapply(dfs, function(df) names(df[get_df_col_ind(df)]))
  df_col_names <- unique(unlist(df_col_names_list, use.names = FALSE))

  # Fail if such a column is present but not a data frame in some input.
  for (df_col_name in df_col_names) {
    for (df in dfs) {
      if (!is.null(df[[df_col_name]]) && !is.data.frame(df[[df_col_name]])) {
        stop(df_col_name, " is not consistently a data frame column")
      }
    }
  }

  # Bind everything except the data frame columns.
  # NOTE(review): `df[<character>]` selects columns for plain data frames;
  # confirm the behaviour if elements of `l` can already be data.tables.
  dfs_regular <- lapply(dfs, function(df) df[setdiff(names(df), df_col_names)])
  res <- data.table::rbindlist(dfs_regular)

  # Bind each data frame column separately and add it to the result.
  for (df_col_name in df_col_names) {
    subdfs <- lapply(dfs, function(df) {
      if (df_col_name %in% names(df)) {
        df[[df_col_name]]
      } else {
        # Zero-column placeholder with the same number of rows as `df`;
        # seq_len() keeps it at zero rows when `df` has none.
        data.frame(row.names = seq_len(nrow(df)))
      }
    })
    # Recursive, to be robust to deeply nested data frame columns.
    res[[df_col_name]] <- rbindlist_fixed(subdfs)
  }
  res
}
dt <- rbindlist_fixed(list(x,y))
dt
#>    a              b
#> 1: 1 <multi-column>
#> 2: 3 <multi-column>
str(dt)
#> Classes 'data.table' and 'data.frame':   2 obs. of  2 variables:
#>  $ a: num  1 3
#>  $ b:Classes 'data.table' and 'data.frame':  2 obs. of  1 variable:
#>   ..$ z: num  2 4
#>   ..- attr(*, ".internal.selfref")=<externalptr> 
#>  - attr(*, ".internal.selfref")=<externalptr>