在R中整理多层嵌套列表(列表的列表的列表)

5

我在R中遇到了多级列表的问题。这里是我的数据结构的一个小例子。

library(purrr)

> example
[[1]]
[[1]][[1]]
  id.value id.name    value
1        2     Tim -1.68956
2        4    Jack  1.23950
3        5    Mary -0.10897
4        3  Joseph -0.11724
5        1  Kermit  0.18308

[[1]][[2]]
  id.value id.name    value
1        6     Tim  0.50381
2        2    Jack  2.52834
3        1    Mary  0.54910
4        4  Joseph  0.23821
5        5  Kermit -1.04889
6        3     Red  1.29476

[[1]][[3]]
  id.value id.name    value
1        4     Tim -0.47279
2        1    Jack -1.06782
3        2    Mary -0.21797
4        3  Joseph -1.02600
5        5  Kermit -0.72889

[[1]]$main.id
[1] 123


[[2]]
[[2]][[1]]
  id.value id.name    value
1        2     Tim -1.16554
2        4    Jack -0.81852
3        1    Mary  0.68494
4        3  Joseph -0.32006
5        5  Kermit -1.31152

[[2]][[2]]
  id.value id.name     value
1        2     Tim  0.821581
2        4    Jack  0.688640
3        5    Mary  0.553918
4        3  Joseph -0.061912
5        1  Kermit -0.305963

[[2]][[3]]
  id.value id.name    value
1        2     Tim  0.80018
2        1    Jack -0.16393
3        4    Mary  1.24292
4        5  Joseph -0.93439
5        3  Kermit  0.39371

[[2]]$main.id
[1] 234

按我的理解,这个结构是一个嵌套列表:外层列表的每个元素又是一个列表,其中包含若干数据框和一个普通向量。通常我会用 purrr 的 map 从这类结构中提取内容,但这次我没办法深入到足够深的层级。最终结果应该类似下面的 result(如果某些值不正确,请见谅,手动整理容易出错)。

> head(result, 2)
# A tibble: 2 x 5
  list.id sub.list.id id.value id.name  value
    <dbl>       <dbl>    <dbl> <chr>    <dbl>
1     123           1        2 Tim     -0.333
2     123           1        4 Jack    -1.02 

> tail(result, 2)
# A tibble: 2 x 5
  list.id sub.list.id id.value id.name value
    <dbl>       <dbl>    <dbl> <chr>   <dbl>
1     234           3        5 Joseph  0.548
2     234           3        3 Kermit  0.239

list.id = main.id,我使用map_dbl(example, c("main.id"))进行操作。

sub.list.id 是最内层列表元素(数据框)的序号。在本示例中,每个主列表下它都从 1 取到 3。

[[1]]
[[1]]**[[1]]**
  id.value id.name      value
1        2     Tim -1.6895557
2        4    Jack  1.2394959
3        5    Mary -0.1089660
4        3  Joseph -0.1172420
5        1  Kermit  0.1830826

其他变量应该不言自明。
我通常使用 purrr 的 map 处理这类复杂列表,但如果有其他好方法也可以尝试。我已经试过 unlist,但它会完全破坏结构,而我认为没有必要这样做。我目前正在尝试先用 bind_cols(把 data.frame 和向量合并在一起),再用带 .id 参数的 bind_rows,但还没有得到任何有意义的结果。
数据:
# Reproducible input for the question: a list with two entries. Each entry
# holds three unnamed data frames followed by a scalar `main.id` (123 and 234).
# Every data frame has a nested data-frame column `id` (with columns `value`
# and `name`) plus a numeric `value` column; the second data frame of the
# first entry has 6 rows, all others have 5.
example <- list(list(structure(list(id = structure(list(value = c(2L, 4L, 
                5L, 3L, 1L), name = c("Tim", "Jack", "Mary", "Joseph", "Kermit"
                )), class = "data.frame", row.names = c(NA, 5L)), value = c(-1.6895556640288, 
                1.23949588599841, -0.108965972315484, -0.117241961787958, 0.183082613838439
                )), class = "data.frame", row.names = c(NA, 5L)), structure(list(
                    id = structure(list(value = c(6L, 2L, 1L, 4L, 5L, 3L), name = c("Tim", 
                    "Jack", "Mary", "Joseph", "Kermit", "Red")), class = "data.frame", row.names = c(NA, 
                    6L)), value = c(0.503812447155119, 2.52833655070411, 0.549096735635542, 
                    0.238212920794043, -1.04889314358654, 1.29476325458416)), class = "data.frame", row.names = c(NA, 
                6L)), structure(list(id = structure(list(value = c(4L, 1L, 2L, 
                3L, 5L), name = c("Tim", "Jack", "Mary", "Joseph", "Kermit")), class = "data.frame", row.names = c(NA, 
                5L)), value = c(-0.472791407727934, -1.06782370598685, -0.217974914658295, 
                -1.02600444830724, -0.72889122929114)), class = "data.frame", row.names = c(NA, 
                5L)), main.id = 123), list(structure(list(id = structure(list(
                    value = c(2L, 4L, 1L, 3L, 5L), name = c("Tim", "Jack", "Mary", 
                    "Joseph", "Kermit")), class = "data.frame", row.names = c(NA, 
                5L)), value = c(-1.16554484788995, -0.818515722513129, 0.684936077925063, 
                -0.320056419276819, -1.31152241139676)), class = "data.frame", row.names = c(NA, 
                5L)), structure(list(id = structure(list(value = c(2L, 4L, 5L, 
                3L, 1L), name = c("Tim", "Jack", "Mary", "Joseph", "Kermit")), class = "data.frame", row.names = c(NA, 
                5L)), value = c(0.821581081637487, 0.688640254100091, 0.553917653537589, 
                -0.0619117105767217, -0.305962663739917)), class = "data.frame", row.names = c(NA, 
                5L)), structure(list(id = structure(list(value = c(2L, 1L, 4L, 
                5L, 3L), name = c("Tim", "Jack", "Mary", "Joseph", "Kermit")), class = "data.frame", row.names = c(NA, 
                5L)), value = c(0.800176865835429, -0.163930968642975, 1.24291877493732, 
                -0.93438505805516, 0.393708652215792)), class = "data.frame", row.names = c(NA, 
                5L)), main.id = 234))

期望输出:

# Expected shape of the flattened output: a 31-row tibble with the columns
# list.id, sub.list.id, id.value, id.name and value (16 rows for list.id 123,
# 15 rows for list.id 234).
# NOTE: the numbers in the `value` column below do not match the `value`
# entries of `example`; only the layout and the id columns are meaningful.
result <- structure(list(list.id = c(123, 123, 123, 123, 123, 123, 123, 
            123, 123, 123, 123, 123, 123, 123, 123, 123, 234, 234, 234, 234, 
            234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234), sub.list.id = c(1, 
            1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 2, 
            2, 2, 2, 2, 3, 3, 3, 3, 3), id.value = c(2, 4, 5, 3, 1, 6, 2, 
            1, 4, 5, 3, 4, 1, 2, 3, 5, 2, 4, 1, 3, 5, 2, 4, 5, 3, 1, 2, 1, 
            4, 5, 3), id.name = c("Tim", "Jack", "Mary", "Joseph", "Kermit", 
            "Tim", "Jack", "Mary", "Joseph", "Kermit", "Red", "Tim", "Jack", 
            "Mary", "Joseph", "Kermit", "Tim", "Jack", "Mary", "Joseph", 
            "Kermit", "Tim", "Jack", "Mary", "Joseph", "Kermit", "Tim", "Jack", 
            "Mary", "Joseph", "Kermit"), value = c(-0.33320738366942, -1.01857538310709, 
            -1.07179122647558, 0.303528641404258, 0.448209778629426, 0.0530042267305041, 
            0.922267467879737, 2.05008468562714, -0.491031166056535, -2.30916887564081, 
            1.00573852446226, -0.709200762582393, -0.688008616467358, 1.0255713696967, 
            -0.284773007051009, -1.22071771225454, 0.18130347974915, -0.138891362439045, 
            0.00576418589988693, 0.38528040112633, -0.370660031792409, 0.644376548518833, 
            -0.220486561818751, 0.331781963915697, 1.09683901314935, 0.435181490833803, 
            -0.325931585531227, 1.14880761845109, 0.993503855962119, 0.54839695950807, 
            0.238731735111441)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
            31L))
3个回答

2

这有点麻烦,而且tidyr::unnest似乎由于某些原因无法正常工作。尽管如此,

library(purrr)

# Flatten `example` into one long data frame with the columns
# list.id, sublist.id, id.value, id.name and value.
example %>%
    # Store each entry's main.id as its name so it can be recovered through
    # `.id` below. The conversion is explicit because map_chr()'s implicit
    # double -> character coercion is deprecated since purrr 1.0.0.
    set_names(map_chr(., function(entry) as.character(entry[["main.id"]]))) %>%
    map(keep, is.data.frame) %>%    # drop the now superfluous `main.id` elements
    map_dfr(    # for each sublist
        function(sublist) {
            map_dfr(    # for each data frame in the sublist
                sublist,
                # unnest: prefix the columns of the nested `id` data frame
                # with "id." and bind them next to the second column (`value`)
                ~dplyr::bind_cols(set_names(.x$id, ~paste0('id.', .x)), .x[2]),
                .id = 'sublist.id'    # position of the data frame within the sublist
            )
        },
        .id = 'list.id') %>%    # names set above, i.e. the main.id values
    readr::type_convert()    # ids were carried as character; convert them back to numbers
#>    list.id sublist.id id.value id.name       value
#> 1      123          1        2     Tim -1.68955566
#> 2      123          1        4    Jack  1.23949589
#> 3      123          1        5    Mary -0.10896597
#> 4      123          1        3  Joseph -0.11724196
#> 5      123          1        1  Kermit  0.18308261
#> 6      123          2        6     Tim  0.50381245
#> 7      123          2        2    Jack  2.52833655
#> 8      123          2        1    Mary  0.54909674
#> 9      123          2        4  Joseph  0.23821292
#> 10     123          2        5  Kermit -1.04889314
#> 11     123          2        3     Red  1.29476325
#> 12     123          3        4     Tim -0.47279141
#> 13     123          3        1    Jack -1.06782371
#> 14     123          3        2    Mary -0.21797491
#> 15     123          3        3  Joseph -1.02600445
#> 16     123          3        5  Kermit -0.72889123
#> 17     234          1        2     Tim -1.16554485
#> 18     234          1        4    Jack -0.81851572
#> 19     234          1        1    Mary  0.68493608
#> 20     234          1        3  Joseph -0.32005642
#> 21     234          1        5  Kermit -1.31152241
#> 22     234          2        2     Tim  0.82158108
#> 23     234          2        4    Jack  0.68864025
#> 24     234          2        5    Mary  0.55391765
#> 25     234          2        3  Joseph -0.06191171
#> 26     234          2        1  Kermit -0.30596266
#> 27     234          3        2     Tim  0.80017687
#> 28     234          3        1    Jack -0.16393097
#> 29     234          3        4    Mary  1.24291877
#> 30     234          3        5  Joseph -0.93438506
#> 31     234          3        3  Kermit  0.39370865

我会采用这个答案。它对我来说有点太巧妙了,要用到真实数据上还需要下些功夫。不过思路都在这里了,我会试着自己解决。谢谢。 - Hakki
如果这个列表来自于 jsonlite::fromJSON,不告诉它简化为数据框可能会使这种情况更简单。roomba 也很有用,但在这种情况下不会为您提取 ID。 - alistaire
是的,这是从JSON转换而来的,我试图挑选其中的一部分,认为只用这个小例子就可以搞定。但是最后一个列表中的那个可恶的数据框在我所有的尝试中都让我感到困扰。如果我一周内无法解决它,我会回来的。谢谢。 - Hakki
2
据我所知,tidyverse不支持数据框列,但包含数据框的列表列是可以的,如下代码可以正常运行:example[[1]][[1]] %>% transform(id=split(id,seq(nrow(id)))) %>% unnest。请注意,我必须使用transform,因为mutate在这里不起作用。还要注意,as_tibble(example[[1]][[1]])会失败并显示错误信息:“列'id'必须是一维原子向量或列表”。 - moodymudskipper
@Moody_Mudskipper 啊!没错,就是这样。我真希望 jsonlite::fromJSON 只生成符合 tibble 规范的/不那么傻的数据框。 - alistaire
显示剩余2条评论

1
下面是我用 tidyverse 给出的一个整洁解法(需要先执行 `library(tidyverse)`):

# Flatten every entry of `example` and stack the results row-wise.
map_dfr(example, function(entry) {
  # Everything before the last element is one of the data frames to flatten.
  frames <- entry[-length(entry)]

  # For each data frame: take its nested `id` data frame, prefix its column
  # names with "id.", and bind the `value` column next to it. The `.id`
  # argument records which data frame each row came from.
  flattened <- map_dfr(
    frames,
    function(frame) {
      id_cols <- rename_all(frame$id, function(nm) paste0("id.", nm))
      bind_cols(id_cols, frame['value'])
    },
    .id = "sub.list.id"
  )

  # Put main.id in front as the first column.
  cbind(entry['main.id'], flattened)
})
#    main.id sub.list.id id.value id.name       value
# 1      123           1        2     Tim -1.68955566
# 2      123           1        4    Jack  1.23949589
# 3      123           1        5    Mary -0.10896597
# 4      123           1        3  Joseph -0.11724196
# 5      123           1        1  Kermit  0.18308261
# 6      123           2        6     Tim  0.50381245
# 7      123           2        2    Jack  2.52833655
# 8      123           2        1    Mary  0.54909674
# 9      123           2        4  Joseph  0.23821292
# 10     123           2        5  Kermit -1.04889314
# 11     123           2        3     Red  1.29476325
# 12     123           3        4     Tim -0.47279141
# 13     123           3        1    Jack -1.06782371
# 14     123           3        2    Mary -0.21797491
# 15     123           3        3  Joseph -1.02600445
# 16     123           3        5  Kermit -0.72889123
# 17     234           1        2     Tim -1.16554485
# 18     234           1        4    Jack -0.81851572
# 19     234           1        1    Mary  0.68493608
# 20     234           1        3  Joseph -0.32005642
# 21     234           1        5  Kermit -1.31152241
# 22     234           2        2     Tim  0.82158108
# 23     234           2        4    Jack  0.68864025
# 24     234           2        5    Mary  0.55391765
# 25     234           2        3  Joseph -0.06191171
# 26     234           2        1  Kermit -0.30596266
# 27     234           3        2     Tim  0.80017687
# 28     234           3        1    Jack -0.16393097
# 29     234           3        4    Mary  1.24291877
# 30     234           3        5  Joseph -0.93438506
# 31     234           3        3  Kermit  0.39370865

0
您可以按以下方式循环嵌套项:

# Flatten `example` into one long data frame using base R only.
#
# Each entry of `example` is expected to hold its data frames first and the
# scalar `main.id` as its last element, and each data frame to carry a nested
# `id` data frame (columns `value`, `name`) plus a `value` column.

# Turn one entry of `example` into a data frame with one row per inner row.
flatten_entry <- function(entry) {
  main_id <- as.numeric(entry[["main.id"]])
  frames <- entry[-length(entry)]  # drop the trailing `main.id`

  # seq_along() yields an empty sequence for an entry without data frames,
  # where `1:(length(entry) - 1)` would iterate over c(1, 0).
  pieces <- lapply(seq_along(frames), function(j) {
    frame <- frames[[j]]
    data.frame(
      list.id = rep(main_id, nrow(frame)),
      sub.list.id = rep(as.numeric(j), nrow(frame)),
      id.value = as.numeric(frame[["id"]][["value"]]),
      id.name = frame[["id"]][["name"]],
      value = as.numeric(frame[["value"]]),
      stringsAsFactors = FALSE  # keep id.name as character on R < 4.0 too
    )
  })

  # Bind once per entry instead of growing vectors element by element.
  do.call(rbind, pieces)
}

result <- do.call(rbind, lapply(example, flatten_entry))

# rbind() over nothing returns NULL; fall back to an empty, correctly typed frame.
if (is.null(result)) {
  result <- data.frame(
    list.id = numeric(0),
    sub.list.id = numeric(0),
    id.value = numeric(0),
    id.name = character(0),
    value = numeric(0),
    stringsAsFactors = FALSE
  )
}

result

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接