使用 dplyr 按多个变量分组汇总(summarise)

3

我有以下数据框:

head(sample_data)
  article value date
1       A 21920 2015
2       I   615 2017
3       B  1414 2018
4       D   102 2018
5       I  1096 2015
6       A  2577 2021

完整数据集

dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C", 
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B", 
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A", 
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G", 
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577, 
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386, 
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227, 
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805, 
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015", 
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017", 
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018", 
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021", 
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019", 
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020", 
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019", 
"2016")), row.names = c(NA, -50L), class = "data.frame")


        

我想使用dplyr来获得类似以下内容的东西:

sample_data %>%
+   group_by(article, date) %>% 
+   summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups:   article [9]
   article date  weight
   <chr>   <chr>  <dbl>
 1 A       2015   55572
 2 A       2016   33948
 3 A       2017   11716
 4 A       2019    8668
 5 A       2020    5386
 6 A       2021    2577
 7 B       2015   16119
 8 B       2018    1414
 9 B       2019    8137
10 B       2020    1174
# ... with 19 more rows

不过,我还想再添加一列,表示每篇文章的 weight 占当年所有文章(A 到 I)weight 总和的比例。每一年内,所有文章的比例之和应为 1。

我尝试了以下代码。我怀疑问题出在 prop 的计算中直接使用了 value 列:它会把组内的每个值都输出一次,所以同一个(article, date)组合会出现多行。我该如何汇总,才能得到像上面那样、只是多出一列比例的结果?

sample_data %>%
+   group_by(article, date) %>% 
+   summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups:   article, date [29]
   article date  weight  prop
   <chr>   <chr>  <dbl> <dbl>
 1 A       2015   55572 0.394
 2 A       2015   55572 0.265
 3 A       2015   55572 0.341
 4 A       2016   33948 0.455
 5 A       2016   33948 0.545
 6 A       2017   11716 1    
 7 A       2019    8668 0.845
 8 A       2019    8668 0.155
 9 A       2020    5386 1    
10 A       2021    2577 1    
# ... with 40 more rows
2个回答

4
在第一次 summarise 之后,每篇文章在每一年只剩一行。接下来您想知道每篇文章占当年总量的比例,因此需要仅按年份(date)重新 group_by,最后用 mutate 计算每篇文章的比例。
library(dplyr)

# Step 1: total `value` for every (article, date) pair.
# `.groups = "keep"` is only there to silence summarise()'s grouping message;
# the grouping is replaced in the next step anyway.
article_year_totals <- sample_data %>%
  group_by(article, date) %>%
  summarise(weight = sum(value), .groups = "keep")

# Step 2: regroup by year alone so that sum(weight) is the yearly total,
# then express each article's weight as a share of that total.
article_year_totals %>%
  group_by(date) %>%
  mutate(prop = weight / sum(weight))
#> # A tibble: 29 x 4
#> # Groups:   date [7]
#>    article date  weight  prop
#>    <chr>   <chr>  <dbl> <dbl>
#>  1 A       2015   55572 0.661
#>  2 A       2016   33948 0.876
#>  3 A       2017   11716 0.632
#>  4 A       2019    8668 0.491
#>  5 A       2020    5386 0.799
#>  6 A       2021    2577 0.628
#>  7 B       2015   16119 0.192
#>  8 B       2018    1414 0.622
#>  9 B       2019    8137 0.461
#> 10 B       2020    1174 0.174
#> # ... with 19 more rows

由 reprex 包(v2.0.1)创建于 2022-02-19


太棒了,那正是我正在寻找的。非常感谢你。 ".groups = 'keep'" 是做什么用的? - nuke
1
@nuke 我加上它只是为了消除 summarise 输出的那条关于分组的提示信息。去掉它,代码照样可以正常运行。 - Allan Cameron

2
另一种做法是只按 date 分组,在一次 summarise 中同时完成按文章求和与比例计算。
library(dplyr)
library(tibble)
library(tidyr)

# Share of the group's total `values` contributed by each article, returned
# as a two-column tibble (article, prop).
article_shares <- function(values, articles) {
  shares <- tapply(values, articles, sum) / sum(values)
  enframe(shares, name = "article", value = "prop")
}

# One summarise() per year: `out` is a data-frame column holding one row per
# article seen in that year, which unpack() then spreads into regular columns.
sample_data %>%
  group_by(date) %>%
  summarise(out = article_shares(value, article), .groups = "drop") %>%
  unpack(out)
# A tibble: 29 × 3
   date  article    prop
   <chr> <chr>     <dbl>
 1 2015  A       0.661  
 2 2015  B       0.192  
 3 2015  D       0.0923 
 4 2015  E       0.00665
 5 2015  F       0.0263 
 6 2015  H       0.00515
 7 2015  I       0.0164 
 8 2016  A       0.876  
 9 2016  C       0.0853 
10 2016  E       0.0123 
# … with 19 more rows

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接